blob: 7d75143e20bda1dd6d3687c6c35ba0da955f1a17 [file] [log] [blame]
/* NEON optimized code (C) COPYRIGHT 2009 Motorola
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
6
7#include "SkBitmapProcState.h"
8#include "SkPerspIter.h"
9#include "SkShader.h"
10#include "SkUtilsArm.h"
11
/* Proc tables for the clamp/clamp and repeat/repeat tile modes. The names
 * are exactly what MAKENAME(_Procs) expands to further down, so they are
 * presumably defined by the matrix headers included below -- confirm against
 * those headers.
 */
12extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
13extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
14
/* Forward declarations of the two decal helpers whose definitions appear at
 * the end of this file, so that code placed before those definitions
 * (including the headers pulled in below) can refer to them.
 */
15static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
16static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
17
/* Returns x shifted right by 16 bits. Because x is unsigned the shift is a
 * logical one: the vacated high bits are always zero.
 */
static unsigned SK_USHIFT16(unsigned x) {
    const unsigned upper = x >> 16;
    return upper;
}
21
/* Parameter macros for the clamp/clamp instantiation of the included header.
 *
 *   MAKENAME(suffix)       builds identifiers of the form
 *                          ClampX_ClampY<suffix>_neon.
 *   TILEX/TILEY_PROCF      take the integer part of a 16.16 coordinate
 *                          (fx >> 16) and pass it, with max, to SkClampMax.
 *   TILEX/TILEY_LOW_BITS   extract bits 12..15 of the coordinate, i.e. the
 *                          top four fractional bits; max is accepted but
 *                          not used.
 *   CHECK_FOR_DECAL        defined with no value, as a flag for the header
 *                          (presumably enables the decal fast paths --
 *                          confirm in the header).
 */
22#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon
23#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
24#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
25#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
26#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
27#define CHECK_FOR_DECAL
28#include "SkBitmapProcState_matrix_clamp_neon.h"
29
/* Parameter macros for the repeat/repeat instantiation of the included
 * header.
 *
 *   MAKENAME(suffix)       builds identifiers of the form
 *                          RepeatX_RepeatY<suffix>_neon.
 *   TILEX/TILEY_PROCF      keep only the 16 fractional bits of the
 *                          coordinate, multiply them by (max + 1) and return
 *                          the upper half of the product via SK_USHIFT16.
 *                          As long as the product does not overflow, the
 *                          result lies in [0, max].
 *   TILEX/TILEY_LOW_BITS   bits 12..15 of that same product.
 *
 * NOTE(review): the macros are redefined here without a preceding #undef, so
 * the previously included header presumably #undefs them -- verify.
 * CHECK_FOR_DECAL is not defined again in this section.
 */
30#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon
31#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
32#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
33#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
34#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
35#include "SkBitmapProcState_matrix_repeat_neon.h"
36
37
/* Writes `count` 16-bit values through dst: element i is the upper 16 bits
 * (the integer part) of the 16.16 fixed-point value fx + i*dx.
 *
 * dst is declared as uint32_t[] but is filled as a sequence of uint16_t, so
 * `count` elements occupy count * 2 bytes.
 *
 * Groups of eight are produced with NEON; any remainder (and every call with
 * count < 8) is handled by the scalar loop at the end. A count <= 0 writes
 * nothing.
 */
void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{
    uint16_t *dst16 = (uint16_t *)dst;

    if (count >= 8) {
        /* SkFixed is 16.16 fixed point */
        const SkFixed dx2 = dx + dx;
        const SkFixed dx4 = dx2 + dx2;
        const SkFixed dx8 = dx4 + dx4;
        const int32x4_t step8 = vdupq_n_s32(dx8);

        /* lbase = { fx, fx+dx, fx+2dx, fx+3dx }. Every lane is seeded with
         * vdupq first so the compiler never sees an uninitialized vector. */
        int32x4_t lbase = vdupq_n_s32(fx);
        lbase = vsetq_lane_s32(fx + dx, lbase, 1);
        lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
        lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);

        /* hbase = the next four coordinates, fx+4dx .. fx+7dx. */
        int32x4_t hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));

        do {
            /* Viewed as eight 16-bit lanes, the odd lanes of each vector are
             * the upper halves of its 32-bit lanes. The unzip's val[1]
             * collects the odd lanes of lbase followed by those of hbase,
             * i.e. the eight integer parts in order. The intrinsic replaces
             * the former ARMv7-only "vuzpq.16" inline asm. */
            const uint16x8x2_t halves = vuzpq_u16(vreinterpretq_u16_s32(lbase),
                                                  vreinterpretq_u16_s32(hbase));
            vst1q_u16(dst16, halves.val[1]);

            /* on to the next group of eight */
            lbase = vaddq_s32(lbase, step8);
            hbase = vaddq_s32(hbase, step8);
            dst16 += 8;
            count -= 8;
            fx += dx8;  /* keep the scalar coordinate in step for the tail */
        } while (count >= 8);
    }

    /* Scalar tail: fewer than eight elements remain. */
    for (int i = count; i > 0; --i) {
        *dst16++ = SkToU16(fx >> 16);
        fx += dx;
    }
}
91
/* Packs one 16.16 fixed-point coordinate into a 32-bit word:
 *
 *     ((fx >> 12) << 14) | ((fx >> 16) + 1)
 *
 * The upper part therefore carries the integer part of fx followed by its
 * top four fractional bits, and the low bits carry the integer part plus one.
 * fx must satisfy (fx >> 30) == 0. The shift is done on an unsigned value
 * because for fx >= 2^29 it reaches bit 31, which is undefined for a signed
 * int.
 */
static inline uint32_t decal_filter_pack_coord(SkFixed fx)
{
    SkASSERT((fx >> (16 + 14)) == 0);
    return ((uint32_t)(fx >> 12) << 14) | (uint32_t)((fx >> 16) + 1);
}

/* Writes `count` packed 32-bit words to dst; element i is the packing (see
 * decal_filter_pack_coord) of the 16.16 fixed-point value fx + i*dx.
 *
 * Groups of eight are produced with NEON, four lanes at a time in two
 * vectors; the remainder (and every call with count < 8) goes through the
 * scalar helper. A count <= 0 writes nothing.
 */
void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{
    if (count >= 8) {
        const int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
        const int32x4_t wide_one = vdupq_n_s32(1);

        /* wide_fx = { fx, fx+dx, fx+2dx, fx+3dx } */
        int32x4_t wide_fx = vdupq_n_s32(fx);
        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);

        /* wide_fx2 = the following four coordinates, fx+4dx .. fx+7dx */
        int32x4_t wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));

        while (count >= 8) {
            /* Same packing as the scalar helper, applied per lane. */
            int32x4_t wide_out = vorrq_s32(
                    vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14),
                    vaddq_s32(vshrq_n_s32(wide_fx, 16), wide_one));
            int32x4_t wide_out2 = vorrq_s32(
                    vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14),
                    vaddq_s32(vshrq_n_s32(wide_fx2, 16), wide_one));

            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));

            dst += 8;
            fx += dx*8;  /* keep the scalar coordinate in step for the tail */
            wide_fx = vaddq_s32(wide_fx, wide_dx8);
            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
            count -= 8;
        }
    }

    /* Scalar tail: fewer than eight elements remain. */
    for (; count > 0; --count) {
        *dst++ = decal_filter_pack_coord(fx);
        fx += dx;
    }
}
144}