blob: ce7be794a22f3e341a74ff50898157299b9f390a [file] [log] [blame]
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic {
29public:
30 virtual void populateScript(Script *);
31 virtual void invokeFreeChildren();
32
33 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34 virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35
36 virtual ~RsdCpuScriptIntrinsicConvolve3x3();
Jason Samsc905efd2012-11-26 15:20:18 -080037 RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
Jason Sams709a0972012-11-15 18:18:04 -080038
39protected:
Jason Samsc905efd2012-11-26 15:20:18 -080040 float mFp[16];
41 short mIp[16];
42 ObjectBaseRef<const Allocation> mAlloc;
43 ObjectBaseRef<const Element> mElement;
Jason Sams709a0972012-11-15 18:18:04 -080044
David Grossb0abb142015-03-12 15:23:03 -070045 static void kernelU1(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -070046 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070047 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070048 static void kernelU2(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -070049 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070050 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070051 static void kernelU4(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -070052 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070053 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070054 static void kernelF1(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -070055 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070056 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070057 static void kernelF2(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -070058 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070059 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070060 static void kernelF4(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -070061 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070062 uint32_t outstep);
Jason Sams709a0972012-11-15 18:18:04 -080063};
64
65}
66}
67
68
69void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
70 rsAssert(slot == 1);
Jason Samsc905efd2012-11-26 15:20:18 -080071 mAlloc.set(static_cast<Allocation *>(data));
Jason Sams709a0972012-11-15 18:18:04 -080072}
73
74void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
75 size_t dataLength) {
76 rsAssert(slot == 0);
Jason Samsc905efd2012-11-26 15:20:18 -080077 memcpy (&mFp, data, dataLength);
Jason Sams709a0972012-11-15 18:18:04 -080078 for(int ct=0; ct < 9; ct++) {
Jason Sams3b35d772013-06-25 17:47:02 -070079 if (mFp[ct] >= 0) {
80 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
81 } else {
82 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
83 }
Jason Sams709a0972012-11-15 18:18:04 -080084 }
85}
86
// Row routine defined outside this file. kernelU4 calls it (when gArchUseSIMD
// is set) with the output pointer, three input row pointers positioned one
// pixel to the left of the first output pixel, the short coefficient table,
// and a count; afterwards kernelU4 advances by two pixels per count unit.
extern "C" void rsdIntrinsicConvolve3x3_K(void *dst,
                                          const void *y0, const void *y1, const void *y2,
                                          const short *coef, uint32_t count);
89
90
David Grossb0abb142015-03-12 15:23:03 -070091static void ConvolveOneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
Jason Sams3b35d772013-06-25 17:47:02 -070092 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
93 const float* coeff) {
Jason Sams709a0972012-11-15 18:18:04 -080094
95 uint32_t x1 = rsMax((int32_t)x-1, 0);
David Grossb0abb142015-03-12 15:23:03 -070096 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
Jason Sams709a0972012-11-15 18:18:04 -080097
98 float4 px = convert_float4(py0[x1]) * coeff[0] +
99 convert_float4(py0[x]) * coeff[1] +
100 convert_float4(py0[x2]) * coeff[2] +
101 convert_float4(py1[x1]) * coeff[3] +
102 convert_float4(py1[x]) * coeff[4] +
103 convert_float4(py1[x2]) * coeff[5] +
104 convert_float4(py2[x1]) * coeff[6] +
105 convert_float4(py2[x]) * coeff[7] +
106 convert_float4(py2[x2]) * coeff[8];
107
Miao Wang4283f572014-11-17 14:59:39 -0800108 px = clamp(px + 0.5f, 0.f, 255.f);
Jason Sams709a0972012-11-15 18:18:04 -0800109 uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
110 *out = o;
111}
112
David Grossb0abb142015-03-12 15:23:03 -0700113static void ConvolveOneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
Jason Sams3b35d772013-06-25 17:47:02 -0700114 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
115 const float* coeff) {
116
117 uint32_t x1 = rsMax((int32_t)x-1, 0);
David Grossb0abb142015-03-12 15:23:03 -0700118 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
Jason Sams3b35d772013-06-25 17:47:02 -0700119
120 float2 px = convert_float2(py0[x1]) * coeff[0] +
121 convert_float2(py0[x]) * coeff[1] +
122 convert_float2(py0[x2]) * coeff[2] +
123 convert_float2(py1[x1]) * coeff[3] +
124 convert_float2(py1[x]) * coeff[4] +
125 convert_float2(py1[x2]) * coeff[5] +
126 convert_float2(py2[x1]) * coeff[6] +
127 convert_float2(py2[x]) * coeff[7] +
128 convert_float2(py2[x2]) * coeff[8];
129
Miao Wang4283f572014-11-17 14:59:39 -0800130 px = clamp(px + 0.5f, 0.f, 255.f);
Jason Sams3b35d772013-06-25 17:47:02 -0700131 *out = convert_uchar2(px);
132}
133
David Grossb0abb142015-03-12 15:23:03 -0700134static void ConvolveOneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
Jason Sams3b35d772013-06-25 17:47:02 -0700135 const uchar *py0, const uchar *py1, const uchar *py2,
136 const float* coeff) {
137
138 uint32_t x1 = rsMax((int32_t)x-1, 0);
David Grossb0abb142015-03-12 15:23:03 -0700139 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
Jason Sams3b35d772013-06-25 17:47:02 -0700140
141 float px = ((float)py0[x1]) * coeff[0] +
142 ((float)py0[x]) * coeff[1] +
143 ((float)py0[x2]) * coeff[2] +
144 ((float)py1[x1]) * coeff[3] +
145 ((float)py1[x]) * coeff[4] +
146 ((float)py1[x2]) * coeff[5] +
147 ((float)py2[x1]) * coeff[6] +
148 ((float)py2[x]) * coeff[7] +
149 ((float)py2[x2]) * coeff[8];
Miao Wang4283f572014-11-17 14:59:39 -0800150 *out = clamp(px + 0.5f, 0.f, 255.f);
Jason Sams3b35d772013-06-25 17:47:02 -0700151}
152
David Grossb0abb142015-03-12 15:23:03 -0700153static void ConvolveOneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
Jason Sams3b35d772013-06-25 17:47:02 -0700154 const float4 *py0, const float4 *py1, const float4 *py2,
155 const float* coeff) {
156
157 uint32_t x1 = rsMax((int32_t)x-1, 0);
David Grossb0abb142015-03-12 15:23:03 -0700158 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
Jason Sams3b35d772013-06-25 17:47:02 -0700159 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
160 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
161 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
162}
163
David Grossb0abb142015-03-12 15:23:03 -0700164static void ConvolveOneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
Jason Sams3b35d772013-06-25 17:47:02 -0700165 const float2 *py0, const float2 *py1, const float2 *py2,
166 const float* coeff) {
167
168 uint32_t x1 = rsMax((int32_t)x-1, 0);
David Grossb0abb142015-03-12 15:23:03 -0700169 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
Jason Sams3b35d772013-06-25 17:47:02 -0700170 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
171 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
172 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
173}
174
David Grossb0abb142015-03-12 15:23:03 -0700175static void ConvolveOneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
Jason Sams3b35d772013-06-25 17:47:02 -0700176 const float *py0, const float *py1, const float *py2,
177 const float* coeff) {
178
179 uint32_t x1 = rsMax((int32_t)x-1, 0);
David Grossb0abb142015-03-12 15:23:03 -0700180 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
Jason Sams3b35d772013-06-25 17:47:02 -0700181 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
182 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
183 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
184}
185
David Grossb0abb142015-03-12 15:23:03 -0700186void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -0700187 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700188 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700189 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
Jason Sams709a0972012-11-15 18:18:04 -0800190
Jason Samsc905efd2012-11-26 15:20:18 -0800191 if (!cp->mAlloc.get()) {
Jason Sams709a0972012-11-15 18:18:04 -0800192 ALOGE("Convolve3x3 executed without input, skipping");
193 return;
194 }
Jason Samsc905efd2012-11-26 15:20:18 -0800195 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
196 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
Jason Sams709a0972012-11-15 18:18:04 -0800197
David Grossb0abb142015-03-12 15:23:03 -0700198 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
199 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
Jason Sams709a0972012-11-15 18:18:04 -0800200 const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
David Grossb0abb142015-03-12 15:23:03 -0700201 const uchar4 *py1 = (const uchar4 *)(pin + stride * info->current.y);
Jason Sams709a0972012-11-15 18:18:04 -0800202 const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
203
David Grossb0abb142015-03-12 15:23:03 -0700204 uchar4 *out = (uchar4 *)info->outPtr[0];
Jason Sams709a0972012-11-15 18:18:04 -0800205 uint32_t x1 = xstart;
206 uint32_t x2 = xend;
207 if(x1 == 0) {
David Grossb0abb142015-03-12 15:23:03 -0700208 ConvolveOneU4(info, 0, out, py0, py1, py2, cp->mFp);
Jason Sams709a0972012-11-15 18:18:04 -0800209 x1 ++;
210 out++;
211 }
212
213 if(x2 > x1) {
Jason Sams074424a2014-05-22 13:30:03 -0700214#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
Jason Samsf5ef8df2013-08-06 13:49:25 -0700215 if (gArchUseSIMD) {
216 int32_t len = (x2 - x1 - 1) >> 1;
217 if(len > 0) {
218 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
219 x1 += len << 1;
220 out += len << 1;
221 }
Jason Sams709a0972012-11-15 18:18:04 -0800222 }
223#endif
224
225 while(x1 != x2) {
David Grossb0abb142015-03-12 15:23:03 -0700226 ConvolveOneU4(info, x1, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700227 out++;
228 x1++;
229 }
230 }
231}
232
David Grossb0abb142015-03-12 15:23:03 -0700233void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -0700234 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700235 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700236 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
Jason Sams3b35d772013-06-25 17:47:02 -0700237
238 if (!cp->mAlloc.get()) {
239 ALOGE("Convolve3x3 executed without input, skipping");
240 return;
241 }
242 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
243 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
244
David Grossb0abb142015-03-12 15:23:03 -0700245 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
246 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
Jason Sams3b35d772013-06-25 17:47:02 -0700247 const uchar2 *py0 = (const uchar2 *)(pin + stride * y2);
David Grossb0abb142015-03-12 15:23:03 -0700248 const uchar2 *py1 = (const uchar2 *)(pin + stride * info->current.y);
Jason Sams3b35d772013-06-25 17:47:02 -0700249 const uchar2 *py2 = (const uchar2 *)(pin + stride * y1);
250
David Grossb0abb142015-03-12 15:23:03 -0700251 uchar2 *out = (uchar2 *)info->outPtr[0];
Jason Sams3b35d772013-06-25 17:47:02 -0700252 uint32_t x1 = xstart;
253 uint32_t x2 = xend;
254 if(x1 == 0) {
David Grossb0abb142015-03-12 15:23:03 -0700255 ConvolveOneU2(info, 0, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700256 x1 ++;
257 out++;
258 }
259
260 if(x2 > x1) {
261#if 0//defined(ARCH_ARM_HAVE_NEON)
262 int32_t len = (x2 - x1 - 1) >> 1;
263 if(len > 0) {
264 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
265 x1 += len << 1;
266 out += len << 1;
267 }
268#endif
269
270 while(x1 != x2) {
David Grossb0abb142015-03-12 15:23:03 -0700271 ConvolveOneU2(info, x1, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700272 out++;
273 x1++;
274 }
275 }
276}
277
David Grossb0abb142015-03-12 15:23:03 -0700278void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -0700279 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700280 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700281 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
Jason Sams3b35d772013-06-25 17:47:02 -0700282
283 if (!cp->mAlloc.get()) {
284 ALOGE("Convolve3x3 executed without input, skipping");
285 return;
286 }
287 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
288 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
289
David Grossb0abb142015-03-12 15:23:03 -0700290 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
291 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
Jason Sams3b35d772013-06-25 17:47:02 -0700292 const uchar *py0 = (const uchar *)(pin + stride * y2);
David Grossb0abb142015-03-12 15:23:03 -0700293 const uchar *py1 = (const uchar *)(pin + stride * info->current.y);
Jason Sams3b35d772013-06-25 17:47:02 -0700294 const uchar *py2 = (const uchar *)(pin + stride * y1);
295
David Grossb0abb142015-03-12 15:23:03 -0700296 uchar *out = (uchar *)info->outPtr[0];
Jason Sams3b35d772013-06-25 17:47:02 -0700297 uint32_t x1 = xstart;
298 uint32_t x2 = xend;
299 if(x1 == 0) {
David Grossb0abb142015-03-12 15:23:03 -0700300 ConvolveOneU1(info, 0, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700301 x1 ++;
302 out++;
303 }
304
305 if(x2 > x1) {
306#if 0//defined(ARCH_ARM_HAVE_NEON)
307 int32_t len = (x2 - x1 - 1) >> 1;
308 if(len > 0) {
309 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
310 x1 += len << 1;
311 out += len << 1;
312 }
313#endif
314
315 while(x1 != x2) {
David Grossb0abb142015-03-12 15:23:03 -0700316 ConvolveOneU1(info, x1, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700317 out++;
318 x1++;
319 }
320 }
321}
322
David Grossb0abb142015-03-12 15:23:03 -0700323void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -0700324 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700325 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700326 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
Jason Sams3b35d772013-06-25 17:47:02 -0700327
328 if (!cp->mAlloc.get()) {
329 ALOGE("Convolve3x3 executed without input, skipping");
330 return;
331 }
332 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
333 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
334
David Grossb0abb142015-03-12 15:23:03 -0700335 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
336 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
Jason Sams3b35d772013-06-25 17:47:02 -0700337 const float4 *py0 = (const float4 *)(pin + stride * y2);
David Grossb0abb142015-03-12 15:23:03 -0700338 const float4 *py1 = (const float4 *)(pin + stride * info->current.y);
Jason Sams3b35d772013-06-25 17:47:02 -0700339 const float4 *py2 = (const float4 *)(pin + stride * y1);
340
David Grossb0abb142015-03-12 15:23:03 -0700341 float4 *out = (float4 *)info->outPtr[0];
Jason Sams3b35d772013-06-25 17:47:02 -0700342 uint32_t x1 = xstart;
343 uint32_t x2 = xend;
344 if(x1 == 0) {
David Grossb0abb142015-03-12 15:23:03 -0700345 ConvolveOneF4(info, 0, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700346 x1 ++;
347 out++;
348 }
349
350 if(x2 > x1) {
351#if 0//defined(ARCH_ARM_HAVE_NEON)
352 int32_t len = (x2 - x1 - 1) >> 1;
353 if(len > 0) {
354 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
355 x1 += len << 1;
356 out += len << 1;
357 }
358#endif
359
360 while(x1 != x2) {
David Grossb0abb142015-03-12 15:23:03 -0700361 ConvolveOneF4(info, x1, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700362 out++;
363 x1++;
364 }
365 }
366}
367
David Grossb0abb142015-03-12 15:23:03 -0700368void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -0700369 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700370 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700371 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
Jason Sams3b35d772013-06-25 17:47:02 -0700372
373 if (!cp->mAlloc.get()) {
374 ALOGE("Convolve3x3 executed without input, skipping");
375 return;
376 }
377 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
378 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
379
David Grossb0abb142015-03-12 15:23:03 -0700380 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
381 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
Jason Sams3b35d772013-06-25 17:47:02 -0700382 const float2 *py0 = (const float2 *)(pin + stride * y2);
David Grossb0abb142015-03-12 15:23:03 -0700383 const float2 *py1 = (const float2 *)(pin + stride * info->current.y);
Jason Sams3b35d772013-06-25 17:47:02 -0700384 const float2 *py2 = (const float2 *)(pin + stride * y1);
385
David Grossb0abb142015-03-12 15:23:03 -0700386 float2 *out = (float2 *)info->outPtr[0];
Jason Sams3b35d772013-06-25 17:47:02 -0700387 uint32_t x1 = xstart;
388 uint32_t x2 = xend;
389 if(x1 == 0) {
David Grossb0abb142015-03-12 15:23:03 -0700390 ConvolveOneF2(info, 0, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700391 x1 ++;
392 out++;
393 }
394
395 if(x2 > x1) {
396#if 0//defined(ARCH_ARM_HAVE_NEON)
397 int32_t len = (x2 - x1 - 1) >> 1;
398 if(len > 0) {
399 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
400 x1 += len << 1;
401 out += len << 1;
402 }
403#endif
404
405 while(x1 != x2) {
David Grossb0abb142015-03-12 15:23:03 -0700406 ConvolveOneF2(info, x1, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700407 out++;
408 x1++;
409 }
410 }
411}
David Grossb0abb142015-03-12 15:23:03 -0700412void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelDriverInfo *info,
Jason Sams3b35d772013-06-25 17:47:02 -0700413 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700414 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700415 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
Jason Sams3b35d772013-06-25 17:47:02 -0700416
417 if (!cp->mAlloc.get()) {
418 ALOGE("Convolve3x3 executed without input, skipping");
419 return;
420 }
421 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
422 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
423
David Grossb0abb142015-03-12 15:23:03 -0700424 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
425 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
Jason Sams3b35d772013-06-25 17:47:02 -0700426 const float *py0 = (const float *)(pin + stride * y2);
David Grossb0abb142015-03-12 15:23:03 -0700427 const float *py1 = (const float *)(pin + stride * info->current.y);
Jason Sams3b35d772013-06-25 17:47:02 -0700428 const float *py2 = (const float *)(pin + stride * y1);
429
David Grossb0abb142015-03-12 15:23:03 -0700430 float *out = (float *)info->outPtr[0];
Jason Sams3b35d772013-06-25 17:47:02 -0700431 uint32_t x1 = xstart;
432 uint32_t x2 = xend;
433 if(x1 == 0) {
David Grossb0abb142015-03-12 15:23:03 -0700434 ConvolveOneF1(info, 0, out, py0, py1, py2, cp->mFp);
Jason Sams3b35d772013-06-25 17:47:02 -0700435 x1 ++;
436 out++;
437 }
438
439 if(x2 > x1) {
440#if 0//defined(ARCH_ARM_HAVE_NEON)
441 int32_t len = (x2 - x1 - 1) >> 1;
442 if(len > 0) {
443 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
444 x1 += len << 1;
445 out += len << 1;
446 }
447#endif
448
449 while(x1 != x2) {
David Grossb0abb142015-03-12 15:23:03 -0700450 ConvolveOneF1(info, x1, out, py0, py1, py2, cp->mFp);
Jason Sams709a0972012-11-15 18:18:04 -0800451 out++;
452 x1++;
453 }
454 }
455}
456
457RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
Jason Samsc905efd2012-11-26 15:20:18 -0800458 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
459 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
Jason Sams709a0972012-11-15 18:18:04 -0800460
Jason Sams3b35d772013-06-25 17:47:02 -0700461 if (e->getType() == RS_TYPE_FLOAT_32) {
462 switch(e->getVectorSize()) {
463 case 1:
464 mRootPtr = &kernelF1;
465 break;
466 case 2:
467 mRootPtr = &kernelF2;
468 break;
469 case 3:
470 case 4:
471 mRootPtr = &kernelF4;
472 break;
473 }
474 } else {
475 switch(e->getVectorSize()) {
476 case 1:
477 mRootPtr = &kernelU1;
478 break;
479 case 2:
480 mRootPtr = &kernelU2;
481 break;
482 case 3:
483 case 4:
484 mRootPtr = &kernelU4;
485 break;
486 }
487 }
Jason Sams709a0972012-11-15 18:18:04 -0800488 for(int ct=0; ct < 9; ct++) {
Jason Samsc905efd2012-11-26 15:20:18 -0800489 mFp[ct] = 1.f / 9.f;
Jason Sams3b35d772013-06-25 17:47:02 -0700490 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
Jason Sams709a0972012-11-15 18:18:04 -0800491 }
492}
493
494RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() {
495}
496
497void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) {
498 s->mHal.info.exportedVariableCount = 2;
499}
500
501void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
Jason Samsc905efd2012-11-26 15:20:18 -0800502 mAlloc.clear();
Jason Sams709a0972012-11-15 18:18:04 -0800503}
504
505
Jason Samsc905efd2012-11-26 15:20:18 -0800506RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
Jason Sams709a0972012-11-15 18:18:04 -0800507
Jason Samsc905efd2012-11-26 15:20:18 -0800508 return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800509}