blob: e81da6705263894b3d529f2782d0a10fd93c26ef [file] [log] [blame]
rmistry@google.comfbfcd562012-08-23 18:09:54 +00001/* NEON optimized code (C) COPYRIGHT 2009 Motorola
digit@google.comfce02ac2012-08-01 14:25:07 +00002 *
3 * Use of this source code is governed by a BSD-style license that can be
4 * found in the LICENSE file.
5 */
6
7#include "SkBitmapProcState.h"
8#include "SkPerspIter.h"
9#include "SkShader.h"
10#include "SkUtilsArm.h"
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000011#include "SkBitmapProcState_utils.h"
digit@google.comfce02ac2012-08-01 14:25:07 +000012
// NOTE(review): the digits (and, on other lines, the author/commit/timestamp text)
// glued to the front of each line look like residue from a web blame view, not
// source text — strip them before compiling.
//
// Tables of NEON matrix procs, one per tiling-mode pair. Only declared here;
// NOTE(review): presumably defined by the template headers included further
// down in this file — confirm.
13extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
14extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
15
// Forward declarations of the decal helpers defined at the bottom of this file.
// Declaring them `static` here gives the later (unqualified) definitions
// internal linkage.
16static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
17static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
18
// Configuration macros for the clamp-mode template header included at the end
// of this group:
//  - MAKENAME pastes identifiers of the form ClampX_ClampY<suffix>_neon.
//  - TILEX/TILEY_PROCF take the integer part of a 16.16 value (>> 16) and pass
//    it with `max` to SkClampMax.
//  - TILEX/TILEY_LOW_BITS extract bits 12..15, i.e. the top four fractional
//    bits; `max` is accepted but unused.
//  - CHECK_FOR_DECAL is defined with no value. NOTE(review): presumably it
//    switches on the decal fast path (decal_*_scale_neon) inside the header —
//    confirm against the header.
digit@google.comfce02ac2012-08-01 14:25:07 +000019#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon
20#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
21#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
22#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
23#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
24#define CHECK_FOR_DECAL
25#include "SkBitmapProcState_matrix_clamp_neon.h"
26
// Configuration macros for the repeat-mode template header included at the end
// of this group:
//  - MAKENAME pastes identifiers of the form RepeatX_RepeatY<suffix>_neon.
//  - TILEX/TILEY_PROCF keep only the 16 fractional bits (& 0xFFFF), scale them
//    by (max + 1) and pass the product to SK_USHIFT16 (defined elsewhere;
//    NOTE(review): presumably an unsigned >> 16 — confirm).
//  - TILEX/TILEY_LOW_BITS take bits 12..15 of that same product; `*` binds
//    tighter than `>>`, so the shift applies to the whole product.
// NOTE(review): these names are redefined without an #undef in this file, so
// the previously included header presumably #undefs them — confirm.
27#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon
28#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
29#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
30#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
31#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
32#include "SkBitmapProcState_matrix_repeat_neon.h"
33
34
digit@google.comfce02ac2012-08-01 14:25:07 +000035
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000036void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
digit@google.comfce02ac2012-08-01 14:25:07 +000037 if (count >= 8) {
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000038 // SkFixed is 16.16 fixed point
39 SkFixed dx8 = dx * 8;
40 int32x4_t vdx8 = vdupq_n_s32(dx8);
digit@google.comfce02ac2012-08-01 14:25:07 +000041
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000042 // setup lbase and hbase
digit@google.comfce02ac2012-08-01 14:25:07 +000043 int32x4_t lbase, hbase;
digit@google.comfce02ac2012-08-01 14:25:07 +000044 lbase = vdupq_n_s32(fx);
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000045 lbase = vsetq_lane_s32(fx + dx, lbase, 1);
46 lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
47 lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);
48 hbase = lbase + vdupq_n_s32(4 * dx);
digit@google.comfce02ac2012-08-01 14:25:07 +000049
digit@google.comfce02ac2012-08-01 14:25:07 +000050 do {
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000051 // store the upper 16 bits
52 vst1q_u32(dst, vreinterpretq_u32_s16(
53 vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1]
54 ));
digit@google.comfce02ac2012-08-01 14:25:07 +000055
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000056 // on to the next group of 8
57 lbase += vdx8;
58 hbase += vdx8;
59 dst += 4; // we did 8 elements but the result is twice smaller
digit@google.comfce02ac2012-08-01 14:25:07 +000060 count -= 8;
61 fx += dx8;
62 } while (count >= 8);
digit@google.comfce02ac2012-08-01 14:25:07 +000063 }
64
65 uint16_t* xx = (uint16_t*)dst;
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000066 for (int i = count; i > 0; --i) {
digit@google.comfce02ac2012-08-01 14:25:07 +000067 *xx++ = SkToU16(fx >> 16); fx += dx;
68 }
69}
70
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000071void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
digit@google.comfce02ac2012-08-01 14:25:07 +000072 if (count >= 8) {
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000073 SkFixed dx8 = dx * 8;
74 int32x4_t vdx8 = vdupq_n_s32(dx8);
digit@google.comfce02ac2012-08-01 14:25:07 +000075
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000076 int32x4_t wide_fx, wide_fx2;
digit@google.comfce02ac2012-08-01 14:25:07 +000077 wide_fx = vdupq_n_s32(fx);
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000078 wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1);
79 wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2);
80 wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3);
digit@google.comfce02ac2012-08-01 14:25:07 +000081
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000082 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx));
digit@google.comfce02ac2012-08-01 14:25:07 +000083
84 while (count >= 8) {
85 int32x4_t wide_out;
86 int32x4_t wide_out2;
87
88 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000089 wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1));
digit@google.comfce02ac2012-08-01 14:25:07 +000090
91 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000092 wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1));
digit@google.comfce02ac2012-08-01 14:25:07 +000093
94 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
95 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
96
97 dst += 8;
commit-bot@chromium.orga8c09662013-09-05 18:27:57 +000098 fx += dx8;
99 wide_fx += vdx8;
100 wide_fx2 += vdx8;
digit@google.comfce02ac2012-08-01 14:25:07 +0000101 count -= 8;
102 }
103 }
104
105 if (count & 1)
106 {
107 SkASSERT((fx >> (16 + 14)) == 0);
108 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
109 fx += dx;
110 }
111 while ((count -= 2) >= 0)
112 {
113 SkASSERT((fx >> (16 + 14)) == 0);
114 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
115 fx += dx;
116
117 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
118 fx += dx;
119 }
120}