/*
 **
 ** Copyright 2009, The Android Open Source Project
 **
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 **
 **     http://www.apache.org/licenses/LICENSE-2.0
 **
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 */

#ifdef ANDROID
    #include <machine/cpu-features.h>
#endif

#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"

#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
#endif

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src, int count,
                                  U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
            "ands ip, %[count], #7 \n\t"
            "vmov.u8 d31, #1<<7 \n\t"
            "vld1.16 {q12}, [%[dst]] \n\t"
            "vld4.8 {d0-d3}, [%[src]] \n\t"
            "moveq ip, #8 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "add %[src], %[src], ip, LSL#2 \n\t"
            "add %[dst], %[dst], ip, LSL#1 \n\t"
            "subs %[count], %[count], ip \n\t"
            "b 9f \n\t"
            // LOOP
            "2: \n\t"

            "vld1.16 {q12}, [%[dst]]! \n\t"
            "vld4.8 {d0-d3}, [%[src]]! \n\t"
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            "sub %[keep_dst], %[dst], #8*2 \n\t"
            "subs %[count], %[count], #8 \n\t"
            "9: \n\t"
            "pld [%[dst],#32] \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vqadd.u8 d6, d6, d0 \n\t"  // moved up
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            "bne 2b \n\t"

            "1: \n\t"
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
            "vmov.u8 d31, #1<<7 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "tst %[count], #4 \n\t"
            "beq 14f \n\t"
            "vld1.16 {d25}, [%[dst]]! \n\t"
            "vld1.32 {q1}, [%[src]]! \n\t"

            "14: \n\t"
            "tst %[count], #2 \n\t"
            "beq 12f \n\t"
            "vld1.32 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d1}, [%[src]]! \n\t"

            "12: \n\t"
            "tst %[count], #1 \n\t"
            "beq 11f \n\t"
            "vld1.16 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d0[1]}, [%[src]]! \n\t"

            "11: \n\t"
            // unzips achieve the same as a vld4 operation
            "vuzpq.u16 q0, q1 \n\t"
            "vuzp.u8 d0, d1 \n\t"
            "vuzp.u8 d2, d3 \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vqadd.u8 d6, d6, d0 \n\t"  // moved up
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            // store
            "tst %[count], #4 \n\t"
            "beq 24f \n\t"
            "vst1.16 {d21}, [%[keep_dst]]! \n\t"

            "24: \n\t"
            "tst %[count], #2 \n\t"
            "beq 22f \n\t"
            "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"

            "22: \n\t"
            "tst %[count], #1 \n\t"
            "beq 21f \n\t"
            "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"

            "21: \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
}
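
/* A rough C sketch (kept out of the build) of what one pixel of the NEON loop
 * above computes: the premultiplied src channels, reduced to 565 depth, are
 * added to the dst channels scaled by the inverse src alpha.  The helpers are
 * the SkColorPriv.h macros already used elsewhere in this file; the NEON
 * code's rounding and saturation differ slightly, so treat this as an
 * illustration, not a bit-exact reference.
 */
#if 0
static inline uint16_t srcover_32_to_565_sketch(SkPMColor c, uint16_t d) {
    unsigned isa = 255 - SkGetPackedA32(c);   // inverse src alpha
    unsigned r = SkPacked32ToR16(c) + SkDiv255Round(SkGetPackedR16(d) * isa);
    unsigned g = SkPacked32ToG16(c) + SkDiv255Round(SkGetPackedG16(d) * isa);
    unsigned b = SkPacked32ToB16(c) + SkDiv255Round(SkGetPackedB16(d) * isa);
    return SkPackRGB16(r, g, b);
}
#endif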

static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     * 1. The results have a few mismatches compared to the original code. These mismatches
     *    never exceed 1. Accuracy vs. a floating point implementation can be improved by
     *    using rounding right shifts (vrshr) for the final stage; the "#if 1" branch further
     *    down does this for a better match with SkDiv255Round(), while the "#else" branch
     *    keeps the original truncating shift.
     * 2. On certain inputs, the original code can overflow, causing colour channels to
     *    mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *    to affect another.
     */

#if 1
        /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
        "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256
#else
        "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256
#endif
        "vmov.u16 q3, #255 \n\t" // set up constant
        "movs r4, %[count], lsr #3 \n\t" // calc. count>>3
        "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon
        "beq 2f \n\t" // if count8 == 0, exit
        "vmov.u16 q15, #0x1f \n\t" // set up blue mask

        "1: \n\t"
        "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels
        "subs r4, r4, #1 \n\t" // decrement loop counter
        "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels
                                                       // and deinterleave

        "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes
        "vand q10, q0, q15 \n\t" // extract blue
        "vshr.u16 q8, q0, #11 \n\t" // extract red
        "vshr.u16 q9, q9, #10 \n\t" // extract green
        // dstrgb = {q8, q9, q10}

        "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range
        "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range
        "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range

        "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits
        "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits
        "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits
        "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits
        // srcrgba = {q11, q12, q13, q14}

        "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale
        "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale
        "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale
        "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale

        "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8
        "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8)
        // dst_scale = q2

        "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale
        "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale
        "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale

#if 1
        // trying for a better match with SkDiv255Round(a)
        // C alg is: a+=128; (a+a>>8)>>8
        // we'll use just a rounding shift [q2 is available for scratch]
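        // e.g. x = 255*255 = 65025: SkDiv255Round(x) = 255, while the rounding
        // shift below gives (x + 128) >> 8 = 254 -- the sort of off-by-one
        // difference described in the header comment above.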
276 "vrshr.u16 q11, q11, #8 \n\t" // shift down red
277 "vrshr.u16 q12, q12, #8 \n\t" // shift down green
278 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue
279#else
280 // arm's original "truncating divide by 256"
reed@android.com4bda7a52009-07-30 20:40:47 +0000281 "vshr.u16 q11, q11, #8 \n\t" // shift down red
282 "vshr.u16 q12, q12, #8 \n\t" // shift down green
283 "vshr.u16 q13, q13, #8 \n\t" // shift down blue
reed@android.com229d9b32010-04-09 18:44:46 +0000284#endif
reed@android.com4bda7a52009-07-30 20:40:47 +0000285
286 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue
287 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue
288 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr
289
290 "bne 1b \n\t" // if counter != 0, loop
291 "2: \n\t" // exit
292
reed@android.com229d9b32010-04-09 18:44:46 +0000293 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
reed@android.com4bda7a52009-07-30 20:40:47 +0000294 :
295 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
296 );
reed@android.com229d9b32010-04-09 18:44:46 +0000297
reed@android.com4bda7a52009-07-30 20:40:47 +0000298 count &= 7;
299 if (count > 0) {
300 do {
301 SkPMColor sc = *src++;
reed@android.com8e4c93b2010-03-09 15:21:28 +0000302 if (sc) {
reed@android.com4bda7a52009-07-30 20:40:47 +0000303 uint16_t dc = *dst;
reed@android.com8e4c93b2010-03-09 15:21:28 +0000304 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
305 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
306 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
307 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
308 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
reed@android.com4bda7a52009-07-30 20:40:47 +0000309 }
310 dst += 1;
311 } while (--count != 0);
312 }
313}
314
315/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
316 * each dither value is spaced out into byte lanes, and repeated
317 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
318 * start of each row.
319 */
320static const uint8_t gDitherMatrix_Neon[48] = {
321 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
322 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
323 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
324 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
325
326};
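
/* Example of the indexing used below: for y == 1 and x == 2 the blit loads the
 * eight bytes starting at gDitherMatrix_Neon[1*12 + 2], i.e. {7,3,6,2,7,3,6,2},
 * so each of the 8 pixels in a NEON batch picks up the dither value of its own
 * screen column.
 */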

static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                       int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
        "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values
        "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values
        "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg
        "vmov.i8 d29, #0x3f \n\t" // set up green mask
        "vmov.i8 d28, #0x1f \n\t" // set up blue mask
        "1: \n\t"
        "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb
        "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5
        "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6
        "vshr.u8 d24, d2, #5 \n\t" // calc. blue >> 5
        "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen
        "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen
        "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen
        "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result
        "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result
        "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result
        "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits
        "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits
        "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits
        // load 8 pixels from dst, extract rgb
        "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels
        "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits
        "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes
        "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red
        "vand d17, d17, d29 \n\t" // and green with green mask
        "vand d18, d18, d28 \n\t" // and blue with blue mask
        "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes
        // src = {d22 (r), d23 (g), d24 (b)}
        // dst = {d16 (r), d17 (g), d18 (b)}
        // subtract dst from src and widen
        "vsubl.s8 q0, d22, d16 \n\t" // red:   src - dst
        "vsubl.s8 q1, d23, d17 \n\t" // green: src - dst
        "vsubl.s8 q2, d24, d18 \n\t" // blue:  src - dst
        // multiply diffs by scale and shift
        "vmul.i16 q0, q0, d6[0] \n\t" // multiply red diff by scale
        "vmul.i16 q1, q1, d6[0] \n\t" // multiply green diff by scale
        "vmul.i16 q2, q2, d6[0] \n\t" // multiply blue diff by scale
375 "subs %[count], %[count], #8 \n\t" // decrement loop counter
376 "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow
377 "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow
378 "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow
379 // add dst to result
380 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red
381 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green
382 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue
383 // put result into 565 format
384 "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue
385 "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue
386 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result
387 "bgt 1b \n\t" // loop if count > 0
388 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
389 : [dstart] "r" (dstart), [scale] "r" (scale)
390 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
391 );
392
393 DITHER_565_SCAN(y);
394
395 while((count & 7) > 0)
396 {
397 SkPMColor c = *src++;
398
399 int dither = DITHER_VALUE(x);
400 int sr = SkGetPackedR32(c);
401 int sg = SkGetPackedG32(c);
402 int sb = SkGetPackedB32(c);
403 sr = SkDITHER_R32To565(sr, dither);
404 sg = SkDITHER_G32To565(sg, dither);
405 sb = SkDITHER_B32To565(sb, dither);
406
407 uint16_t d = *dst;
408 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
409 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
410 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
411 DITHER_INC_X(x);
412 count--;
413 }
414}

#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
#else
#define S32A_D565_Opaque_PROC       NULL
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#endif

/* Don't have a special version that assumes each src is opaque, but our S32A
   is still faster than the default, so use it here
 */
#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                       const SkPMColor* SK_RESTRICT src,
                                       int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);
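        /* With this table, the vtbl1_u8() calls below copy byte 3 (the alpha
         * of the first pixel) into lanes 0-3 and byte 7 (the alpha of the
         * second pixel) into lanes 4-7, so every colour byte gets paired with
         * its own pixel's alpha. */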

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
                /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
                /* we collapsed (255-a)+1 ... */
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
                alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
                alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
                /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
                /* we collapsed (255-a)+1 ... */
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
                alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
                alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}

#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon

#else

#ifdef TEST_SRC_ALPHA
#error The ARM asm version of S32A_Opaque_BlitRow32 does not support TEST_SRC_ALPHA
#endif

static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                      const SkPMColor* SK_RESTRICT src,
                                      int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    /* Does not support the TEST_SRC_ALPHA case */
    asm volatile (
        "cmp %[count], #0 \n\t" /* comparing count with 0 */
        "beq 3f \n\t" /* if zero exit */

        "mov ip, #0xff \n\t" /* load the 0xff mask in ip */
        "orr ip, ip, ip, lsl #16 \n\t" /* convert it to 0xff00ff in ip */

        "cmp %[count], #2 \n\t" /* compare count with 2 */
        "blt 2f \n\t" /* if less than 2 -> single loop */

        /* Double Loop */
        "1: \n\t" /* <double loop> */
        "ldm %[src]!, {r5,r6} \n\t" /* load the src(s) at r5-r6 */
        "ldm %[dst], {r7,r8} \n\t" /* loading dst(s) into r7-r8 */
        "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */

        /* ----------- */
        "and r9, ip, r7 \n\t" /* r9 = br masked by ip */
        "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */
        "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */

        "mul r9, r9, r4 \n\t" /* br = br * scale */
        "mul r10, r10, r4 \n\t" /* ag = ag * scale */
        "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */

        "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
        "lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */
        "orr r7, r9, r10 \n\t" /* br | ag */

        "add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */
602 "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */

        /* ----------- */
        "and r9, ip, r8 \n\t" /* r9 = br masked by ip */

        "and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */
        "mul r9, r9, r4 \n\t" /* br = br * scale */
        "sub %[count], %[count], #2 \n\t"
        "mul r10, r10, r4 \n\t" /* ag = ag * scale */

        "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */
        "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */
        "cmp %[count], #1 \n\t" /* comparing count with 1 */
        "orr r8, r9, r10 \n\t" /* br | ag */

        "add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */

        /* ----------------- */
        "stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */
        /* ----------------- */

        "bgt 1b \n\t" /* if greater than 1 -> reloop */
        "blt 3f \n\t" /* if less than 1 -> exit */

        /* Single Loop */
        "2: \n\t" /* <single loop> */
        "ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */
        "ldr r7, [%[dst]] \n\t" /* loading dst into r7 */
        "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */

        /* ----------- */
        "and r9, ip, r7 \n\t" /* r9 = br masked by ip */
        "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */

        "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */
        "mul r9, r9, r4 \n\t" /* br = br * scale */
        "mul r10, r10, r4 \n\t" /* ag = ag * scale */
        "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */

        "and r10, r10, ip, lsl #8 \n\t" /* mask ag */
        "orr r7, r9, r10 \n\t" /* br | ag */

        "add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */

        /* ----------------- */
        "str r7, [%[dst]], #4 \n\t" /* *dst = r7, increment dst by one (times 4) */
        /* ----------------- */

        "3: \n\t" /* <exit> */
        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
        :
        : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
    );
}
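
/* A C sketch (kept out of the build) of the per-pixel math the asm above
 * performs: with scale = 256 - srcA, the byte pairs (b,r) and (a,g) of the
 * packed dst are scaled in parallel using the 0x00ff00ff mask and added to the
 * src.  This is the same math as SkPMSrcOver(); the helper name here is
 * illustrative only.
 */
#if 0
static inline SkPMColor srcover_one_pixel_sketch(SkPMColor src, SkPMColor dst) {
    uint32_t scale = 256 - (src >> 24);                         // 256 - srcA
    uint32_t br = ((dst & 0x00ff00ff) * scale >> 8) & 0x00ff00ff;
    uint32_t ag = (((dst >> 8) & 0x00ff00ff) * scale) & 0xff00ff00;
    return src + (br | ag);
}
#endif
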
#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_arm
#endif

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;
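        /* e.g. alpha == 0x80 gives src_scale == 129 and dst_scale == 127, so
         * each result byte below is (src*129 + dst*127) >> 8 */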

        /* run them N at a time through the NEON unit */
        /* note that each pixel is 4 bytes, each byte treated exactly the same,
         * so we can work under that guise. We *do* know that the src&dst
         * will be 32-bit aligned quantities, so we can specify that on
         * the load/store ops and do a neon 'reinterpret' to get us to
         * byte-sized (pun intended) pieces that we widen/multiply/shift.
         * We're limited to 128 bits in the wide ops, which is 8x16 bits
         * or a pair of 32-bit src/dsts.
         */
        /* we *could* manually unroll this loop so that we load 128 bits
         * (as a pair of 64s) from each of src and dst, processing them
         * in pieces. This might give us a little better management of
         * the memory latency, but my initial attempts here did not
         * produce an instruction stream that looked all that nice.
         */
#define UNROLL 2
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint16x8_t src_wide, dst_wide;

            /* get 64 bits of src, widen it, multiply by src_scale */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
            src_wide = vmovl_u8(src_raw);
            /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
            src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

            /* ditto with dst */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
            dst_wide = vmovl_u8(dst_raw);

            /* combine add with dst multiply into mul-accumulate */
            dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

            dst_final = vshrn_n_u16(dst_wide, 8);
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
        /* RBE: well, i don't like how gcc manages src/dst across the above
         * loop; it's constantly calculating src+bias and dst+bias, and it only
         * adjusts the real ones when we leave the loop. Not sure why
         * it's "hoisting down" (hoisting implies above in my lexicon ;))
         * the adjustments to src/dst/count, but it does...
         * (might be SSA-style internal logic...)
         */

#if UNROLL == 2
        if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        }
#else
        if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
        }
#endif

#undef UNROLL
    }
}

#define S32_Blend_BlitRow32_PROC    S32_Blend_BlitRow32_neon
#else
#define S32_Blend_BlitRow32_PROC    NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));  /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {
        uint8x8_t dbase;

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            /* calculate 8 elements worth into a temp buffer */
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i=0;i<UNROLL;i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;

                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

            /* source is in ABGR */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }

            /* calculate 'd', which will be 0..7 */
            /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if ANDROID
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
            alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
            alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
            d = vshrn_n_u16(alpha8, 8);  /* narrowing too */

            /* sr = sr - (sr>>5) + d */
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d) */
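            /* concretely: sr == 255 gives 255 - 7 + 7 == 255, still in range,
             * whereas adding d before subtracting (sr>>5) could reach 262 and
             * wrap in the 8-bit lane */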
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
            dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
            dst_r = vshrq_n_u16(dst8,11);  /* clearing hi bits */

            /* blend */
#if 1
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            /* originally 255-sa + 1 */
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
            scale8 = vsubw_u8(vdupq_n_u16(255), sa);
            scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
            /* combine the addq and mul, save 3 insns */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
            /* known correct, but +3 insns over above */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmulq_u16(dst_b, scale8);
            dst_g = vmulq_u16(dst_g, scale8);
            dst_r = vmulq_u16(dst_r, scale8);

            /* combine */
            /* NB: vshll widens, need to preserve those bits */
            dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
            dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
            dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

            /* repack to store */
            dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            /* verify my 8 elements match the temp buffer */
            {
                int i, bad=0;
                static int invocation;

                for (i=0;i<UNROLL;i++)
                    if (tmpbuf[i] != dst[i]) bad=1;
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i=0;i<UNROLL;i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
                                 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    /* cop out */
                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
#else
#define S32A_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but the ARM version is 26 insns/iteration and
 * this NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
 * i.e. roughly a tenth of the native version's per-element instruction
 * count; that's pure instruction counts, not accounting for any
 * instruction or memory latencies.
 */

#undef DEBUG_S32_OPAQUE_DITHER

static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                        const SkPMColor* SK_RESTRICT src,
                                        int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb, sa;
            uint16x8_t dr, dg, db, da;
            uint16x8_t dst8;

            /* source is in ABGR ordering (R == lsb) */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            /* sr = sr - (sr>>5) + d */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d,1));
            /* XXX: check that the "d>>1" here is hoisted */

            /* pack high bits of each into 565 format (rgb, b is lsb) */
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

            /* store it */
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            /* always good to know if we generated good results */
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    SkPMColor c = src[i];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            x += UNROLL;  /* probably superfluous */
        }
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
#else
#define S32_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

static const SkBlitRow::Proc platform_565_procs[] = {
    // no dither
    S32_D565_Opaque_PROC,
    S32_D565_Blend_PROC,
    S32A_D565_Opaque_PROC,
    S32A_D565_Blend_PROC,

    // dither
    S32_D565_Opaque_Dither_PROC,
    S32_D565_Blend_Dither_PROC,
    S32A_D565_Opaque_Dither_PROC,
    NULL,   // S32A_D565_Blend_Dither
};

static const SkBlitRow::Proc platform_4444_procs[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

static const SkBlitRow::Proc32 platform_32_procs[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_PROC,       // S32_Blend,
    S32A_Opaque_BlitRow32_PROC,     // S32A_Opaque,
    NULL,   // S32A_Blend,
};

SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
    return platform_4444_procs[flags];
}

SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return platform_565_procs[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return platform_32_procs[flags];
}