blob: 92e739febe86f4d96af6b0a6d75d4e75b604e207 [file] [log] [blame]
Jason Sams709a0972012-11-15 18:18:04 -08001/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29public:
Stephen Hinesc060f142015-05-13 19:26:09 -070030 void populateScript(Script *) override;
31 void invokeFreeChildren() override;
Jason Sams709a0972012-11-15 18:18:04 -080032
Stephen Hinesc060f142015-05-13 19:26:09 -070033 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
34 void setGlobalObj(uint32_t slot, ObjectBase *data) override;
Jason Sams709a0972012-11-15 18:18:04 -080035
Stephen Hinesc060f142015-05-13 19:26:09 -070036 ~RsdCpuScriptIntrinsicConvolve5x5() override;
Jason Samsc905efd2012-11-26 15:20:18 -080037 RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
Jason Sams709a0972012-11-15 18:18:04 -080038
39protected:
Jason Sams34b0d312013-06-26 15:34:02 -070040 float mFp[28];
41 short mIp[28];
Jason Sams709a0972012-11-15 18:18:04 -080042 ObjectBaseRef<Allocation> alloc;
43
44
David Grossb0abb142015-03-12 15:23:03 -070045 static void kernelU1(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -070046 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070047 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070048 static void kernelU2(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -070049 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070050 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070051 static void kernelU4(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -070052 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070053 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070054 static void kernelF1(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -070055 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070056 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070057 static void kernelF2(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -070058 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070059 uint32_t outstep);
David Grossb0abb142015-03-12 15:23:03 -070060 static void kernelF4(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -070061 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -070062 uint32_t outstep);
Jason Sams709a0972012-11-15 18:18:04 -080063
64
65};
66
67}
68}
69
70void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
71 rsAssert(slot == 1);
72 alloc.set(static_cast<Allocation *>(data));
73}
74
75void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
76 const void *data, size_t dataLength) {
77 rsAssert(slot == 0);
Jason Sams34b0d312013-06-26 15:34:02 -070078 memcpy (&mFp, data, dataLength);
Jason Sams709a0972012-11-15 18:18:04 -080079 for(int ct=0; ct < 25; ct++) {
Jason Sams34b0d312013-06-26 15:34:02 -070080 if (mFp[ct] >= 0) {
81 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
82 } else {
83 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
84 }
Jason Sams709a0972012-11-15 18:18:04 -080085 }
86}
87
88
David Grossb0abb142015-03-12 15:23:03 -070089static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
Jason Sams34b0d312013-06-26 15:34:02 -070090 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
91 const float* coeff) {
Jason Sams709a0972012-11-15 18:18:04 -080092
93 uint32_t x0 = rsMax((int32_t)x-2, 0);
94 uint32_t x1 = rsMax((int32_t)x-1, 0);
95 uint32_t x2 = x;
David Grossb0abb142015-03-12 15:23:03 -070096 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
97 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
Jason Sams709a0972012-11-15 18:18:04 -080098
99 float4 px = convert_float4(py0[x0]) * coeff[0] +
100 convert_float4(py0[x1]) * coeff[1] +
101 convert_float4(py0[x2]) * coeff[2] +
102 convert_float4(py0[x3]) * coeff[3] +
103 convert_float4(py0[x4]) * coeff[4] +
104
105 convert_float4(py1[x0]) * coeff[5] +
106 convert_float4(py1[x1]) * coeff[6] +
107 convert_float4(py1[x2]) * coeff[7] +
108 convert_float4(py1[x3]) * coeff[8] +
109 convert_float4(py1[x4]) * coeff[9] +
110
111 convert_float4(py2[x0]) * coeff[10] +
112 convert_float4(py2[x1]) * coeff[11] +
113 convert_float4(py2[x2]) * coeff[12] +
114 convert_float4(py2[x3]) * coeff[13] +
115 convert_float4(py2[x4]) * coeff[14] +
116
117 convert_float4(py3[x0]) * coeff[15] +
118 convert_float4(py3[x1]) * coeff[16] +
119 convert_float4(py3[x2]) * coeff[17] +
120 convert_float4(py3[x3]) * coeff[18] +
121 convert_float4(py3[x4]) * coeff[19] +
122
123 convert_float4(py4[x0]) * coeff[20] +
124 convert_float4(py4[x1]) * coeff[21] +
125 convert_float4(py4[x2]) * coeff[22] +
126 convert_float4(py4[x3]) * coeff[23] +
127 convert_float4(py4[x4]) * coeff[24];
Miao Wang4283f572014-11-17 14:59:39 -0800128 px = clamp(px + 0.5f, 0.f, 255.f);
Jason Sams34b0d312013-06-26 15:34:02 -0700129 *out = convert_uchar4(px);
Jason Sams709a0972012-11-15 18:18:04 -0800130}
131
David Grossb0abb142015-03-12 15:23:03 -0700132static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
Jason Sams34b0d312013-06-26 15:34:02 -0700133 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
134 const float* coeff) {
135
136 uint32_t x0 = rsMax((int32_t)x-2, 0);
137 uint32_t x1 = rsMax((int32_t)x-1, 0);
138 uint32_t x2 = x;
David Grossb0abb142015-03-12 15:23:03 -0700139 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
140 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700141
142 float2 px = convert_float2(py0[x0]) * coeff[0] +
143 convert_float2(py0[x1]) * coeff[1] +
144 convert_float2(py0[x2]) * coeff[2] +
145 convert_float2(py0[x3]) * coeff[3] +
146 convert_float2(py0[x4]) * coeff[4] +
147
148 convert_float2(py1[x0]) * coeff[5] +
149 convert_float2(py1[x1]) * coeff[6] +
150 convert_float2(py1[x2]) * coeff[7] +
151 convert_float2(py1[x3]) * coeff[8] +
152 convert_float2(py1[x4]) * coeff[9] +
153
154 convert_float2(py2[x0]) * coeff[10] +
155 convert_float2(py2[x1]) * coeff[11] +
156 convert_float2(py2[x2]) * coeff[12] +
157 convert_float2(py2[x3]) * coeff[13] +
158 convert_float2(py2[x4]) * coeff[14] +
159
160 convert_float2(py3[x0]) * coeff[15] +
161 convert_float2(py3[x1]) * coeff[16] +
162 convert_float2(py3[x2]) * coeff[17] +
163 convert_float2(py3[x3]) * coeff[18] +
164 convert_float2(py3[x4]) * coeff[19] +
165
166 convert_float2(py4[x0]) * coeff[20] +
167 convert_float2(py4[x1]) * coeff[21] +
168 convert_float2(py4[x2]) * coeff[22] +
169 convert_float2(py4[x3]) * coeff[23] +
170 convert_float2(py4[x4]) * coeff[24];
Miao Wang4283f572014-11-17 14:59:39 -0800171 px = clamp(px + 0.5f, 0.f, 255.f);
Jason Sams34b0d312013-06-26 15:34:02 -0700172 *out = convert_uchar2(px);
173}
174
David Grossb0abb142015-03-12 15:23:03 -0700175static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
Jason Sams34b0d312013-06-26 15:34:02 -0700176 const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
177 const float* coeff) {
178
179 uint32_t x0 = rsMax((int32_t)x-2, 0);
180 uint32_t x1 = rsMax((int32_t)x-1, 0);
181 uint32_t x2 = x;
David Grossb0abb142015-03-12 15:23:03 -0700182 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
183 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700184
185 float px = (float)(py0[x0]) * coeff[0] +
186 (float)(py0[x1]) * coeff[1] +
187 (float)(py0[x2]) * coeff[2] +
188 (float)(py0[x3]) * coeff[3] +
189 (float)(py0[x4]) * coeff[4] +
190
191 (float)(py1[x0]) * coeff[5] +
192 (float)(py1[x1]) * coeff[6] +
193 (float)(py1[x2]) * coeff[7] +
194 (float)(py1[x3]) * coeff[8] +
195 (float)(py1[x4]) * coeff[9] +
196
197 (float)(py2[x0]) * coeff[10] +
198 (float)(py2[x1]) * coeff[11] +
199 (float)(py2[x2]) * coeff[12] +
200 (float)(py2[x3]) * coeff[13] +
201 (float)(py2[x4]) * coeff[14] +
202
203 (float)(py3[x0]) * coeff[15] +
204 (float)(py3[x1]) * coeff[16] +
205 (float)(py3[x2]) * coeff[17] +
206 (float)(py3[x3]) * coeff[18] +
207 (float)(py3[x4]) * coeff[19] +
208
209 (float)(py4[x0]) * coeff[20] +
210 (float)(py4[x1]) * coeff[21] +
211 (float)(py4[x2]) * coeff[22] +
212 (float)(py4[x3]) * coeff[23] +
213 (float)(py4[x4]) * coeff[24];
Miao Wang4283f572014-11-17 14:59:39 -0800214 px = clamp(px + 0.5f, 0.f, 255.f);
Jason Sams34b0d312013-06-26 15:34:02 -0700215 *out = px;
216}
217
David Grossb0abb142015-03-12 15:23:03 -0700218static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
Jason Sams34b0d312013-06-26 15:34:02 -0700219 const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
220 const float* coeff) {
221
222 uint32_t x0 = rsMax((int32_t)x-2, 0);
223 uint32_t x1 = rsMax((int32_t)x-1, 0);
224 uint32_t x2 = x;
David Grossb0abb142015-03-12 15:23:03 -0700225 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
226 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700227
228 float4 px = py0[x0] * coeff[0] +
229 py0[x1] * coeff[1] +
230 py0[x2] * coeff[2] +
231 py0[x3] * coeff[3] +
232 py0[x4] * coeff[4] +
233
234 py1[x0] * coeff[5] +
235 py1[x1] * coeff[6] +
236 py1[x2] * coeff[7] +
237 py1[x3] * coeff[8] +
238 py1[x4] * coeff[9] +
239
240 py2[x0] * coeff[10] +
241 py2[x1] * coeff[11] +
242 py2[x2] * coeff[12] +
243 py2[x3] * coeff[13] +
244 py2[x4] * coeff[14] +
245
246 py3[x0] * coeff[15] +
247 py3[x1] * coeff[16] +
248 py3[x2] * coeff[17] +
249 py3[x3] * coeff[18] +
250 py3[x4] * coeff[19] +
251
252 py4[x0] * coeff[20] +
253 py4[x1] * coeff[21] +
254 py4[x2] * coeff[22] +
255 py4[x3] * coeff[23] +
256 py4[x4] * coeff[24];
257 *out = px;
258}
259
David Grossb0abb142015-03-12 15:23:03 -0700260static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
Jason Sams34b0d312013-06-26 15:34:02 -0700261 const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
262 const float* coeff) {
263
264 uint32_t x0 = rsMax((int32_t)x-2, 0);
265 uint32_t x1 = rsMax((int32_t)x-1, 0);
266 uint32_t x2 = x;
David Grossb0abb142015-03-12 15:23:03 -0700267 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
268 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700269
270 float2 px = py0[x0] * coeff[0] +
271 py0[x1] * coeff[1] +
272 py0[x2] * coeff[2] +
273 py0[x3] * coeff[3] +
274 py0[x4] * coeff[4] +
275
276 py1[x0] * coeff[5] +
277 py1[x1] * coeff[6] +
278 py1[x2] * coeff[7] +
279 py1[x3] * coeff[8] +
280 py1[x4] * coeff[9] +
281
282 py2[x0] * coeff[10] +
283 py2[x1] * coeff[11] +
284 py2[x2] * coeff[12] +
285 py2[x3] * coeff[13] +
286 py2[x4] * coeff[14] +
287
288 py3[x0] * coeff[15] +
289 py3[x1] * coeff[16] +
290 py3[x2] * coeff[17] +
291 py3[x3] * coeff[18] +
292 py3[x4] * coeff[19] +
293
294 py4[x0] * coeff[20] +
295 py4[x1] * coeff[21] +
296 py4[x2] * coeff[22] +
297 py4[x3] * coeff[23] +
298 py4[x4] * coeff[24];
299 *out = px;
300}
301
David Grossb0abb142015-03-12 15:23:03 -0700302static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
Jason Sams34b0d312013-06-26 15:34:02 -0700303 const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
304 const float* coeff) {
305
306 uint32_t x0 = rsMax((int32_t)x-2, 0);
307 uint32_t x1 = rsMax((int32_t)x-1, 0);
308 uint32_t x2 = x;
David Grossb0abb142015-03-12 15:23:03 -0700309 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
310 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700311
312 float px = py0[x0] * coeff[0] +
313 py0[x1] * coeff[1] +
314 py0[x2] * coeff[2] +
315 py0[x3] * coeff[3] +
316 py0[x4] * coeff[4] +
317
318 py1[x0] * coeff[5] +
319 py1[x1] * coeff[6] +
320 py1[x2] * coeff[7] +
321 py1[x3] * coeff[8] +
322 py1[x4] * coeff[9] +
323
324 py2[x0] * coeff[10] +
325 py2[x1] * coeff[11] +
326 py2[x2] * coeff[12] +
327 py2[x3] * coeff[13] +
328 py2[x4] * coeff[14] +
329
330 py3[x0] * coeff[15] +
331 py3[x1] * coeff[16] +
332 py3[x2] * coeff[17] +
333 py3[x3] * coeff[18] +
334 py3[x4] * coeff[19] +
335
336 py4[x0] * coeff[20] +
337 py4[x1] * coeff[21] +
338 py4[x2] * coeff[22] +
339 py4[x3] * coeff[23] +
340 py4[x4] * coeff[24];
341 *out = px;
342}
343
344
// SIMD convolution routine with C linkage; only declared here, the
// definition lives outside this file. kernelU4 calls it with the output
// pointer, the five source-row pointers (already shifted two pixels left of
// the first output column), the short coefficient table and a block count.
// kernelU4 advances by 4 pixels per count on the x86 path and by 2 pixels
// per count on the ARM path.
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0, const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count);
348
David Grossb0abb142015-03-12 15:23:03 -0700349void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -0700350 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700351 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700352 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
Jason Sams709a0972012-11-15 18:18:04 -0800353 if (!cp->alloc.get()) {
354 ALOGE("Convolve5x5 executed without input, skipping");
355 return;
356 }
357 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
358 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
359
David Grossb0abb142015-03-12 15:23:03 -0700360 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
361 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
362 uint32_t y2 = info->current.y;
363 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
364 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
Jason Sams709a0972012-11-15 18:18:04 -0800365
366 const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
367 const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
368 const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
369 const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
370 const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
371
David Grossb0abb142015-03-12 15:23:03 -0700372 uchar4 *out = (uchar4 *)info->outPtr[0];
Jason Sams709a0972012-11-15 18:18:04 -0800373 uint32_t x1 = xstart;
374 uint32_t x2 = xend;
375
376 while((x1 < x2) && (x1 < 2)) {
David Grossb0abb142015-03-12 15:23:03 -0700377 OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams709a0972012-11-15 18:18:04 -0800378 out++;
379 x1++;
380 }
Rose, James7b7060c2014-04-22 12:08:06 +0800381#if defined(ARCH_X86_HAVE_SSSE3)
382 // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
383 // 3 for end boundary where x may hit the end boundary)
384 if (gArchUseSIMD &&((x1 + 6) < x2)) {
385 // subtract 3 for end boundary
386 uint32_t len = (x2 - x1 - 3) >> 2;
Yong Chen45d29c42014-09-15 10:32:38 +0800387 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
Rose, James7b7060c2014-04-22 12:08:06 +0800388 out += len << 2;
389 x1 += len << 2;
390 }
391#endif
Jason Sams709a0972012-11-15 18:18:04 -0800392
Jason Sams074424a2014-05-22 13:30:03 -0700393#if defined(ARCH_ARM_USE_INTRINSICS)
Jason Samsf5ef8df2013-08-06 13:49:25 -0700394 if(gArchUseSIMD && ((x1 + 3) < x2)) {
Jason Sams709a0972012-11-15 18:18:04 -0800395 uint32_t len = (x2 - x1 - 3) >> 1;
Jason Samsde52a832014-08-20 17:22:57 -0700396 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
Jason Sams34b0d312013-06-26 15:34:02 -0700397 out += len << 1;
398 x1 += len << 1;
399 }
400#endif
401
402 while(x1 < x2) {
David Grossb0abb142015-03-12 15:23:03 -0700403 OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700404 out++;
405 x1++;
406 }
407}
408
David Grossb0abb142015-03-12 15:23:03 -0700409void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -0700410 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700411 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700412 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
Jason Sams34b0d312013-06-26 15:34:02 -0700413 if (!cp->alloc.get()) {
414 ALOGE("Convolve5x5 executed without input, skipping");
415 return;
416 }
417 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
418 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
419
David Grossb0abb142015-03-12 15:23:03 -0700420 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
421 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
422 uint32_t y2 = info->current.y;
423 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
424 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700425
426 const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
427 const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
428 const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
429 const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
430 const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
431
David Grossb0abb142015-03-12 15:23:03 -0700432 uchar2 *out = (uchar2 *)info->outPtr[0];
Jason Sams34b0d312013-06-26 15:34:02 -0700433 uint32_t x1 = xstart;
434 uint32_t x2 = xend;
435
436 while((x1 < x2) && (x1 < 2)) {
David Grossb0abb142015-03-12 15:23:03 -0700437 OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700438 out++;
439 x1++;
440 }
441
442#if 0//defined(ARCH_ARM_HAVE_NEON)
443 if((x1 + 3) < x2) {
444 uint32_t len = (x2 - x1 - 3) >> 1;
Jason Sams709a0972012-11-15 18:18:04 -0800445 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
446 out += len << 1;
447 x1 += len << 1;
448 }
449#endif
450
451 while(x1 < x2) {
David Grossb0abb142015-03-12 15:23:03 -0700452 OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams709a0972012-11-15 18:18:04 -0800453 out++;
454 x1++;
455 }
456}
457
David Grossb0abb142015-03-12 15:23:03 -0700458void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -0700459 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700460 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700461 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
Jason Sams34b0d312013-06-26 15:34:02 -0700462 if (!cp->alloc.get()) {
463 ALOGE("Convolve5x5 executed without input, skipping");
464 return;
465 }
466 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
467 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
468
David Grossb0abb142015-03-12 15:23:03 -0700469 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
470 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
471 uint32_t y2 = info->current.y;
472 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
473 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700474
475 const uchar *py0 = (const uchar *)(pin + stride * y0);
476 const uchar *py1 = (const uchar *)(pin + stride * y1);
477 const uchar *py2 = (const uchar *)(pin + stride * y2);
478 const uchar *py3 = (const uchar *)(pin + stride * y3);
479 const uchar *py4 = (const uchar *)(pin + stride * y4);
480
David Grossb0abb142015-03-12 15:23:03 -0700481 uchar *out = (uchar *)info->outPtr[0];
Jason Sams34b0d312013-06-26 15:34:02 -0700482 uint32_t x1 = xstart;
483 uint32_t x2 = xend;
484
485 while((x1 < x2) && (x1 < 2)) {
David Grossb0abb142015-03-12 15:23:03 -0700486 OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700487 out++;
488 x1++;
489 }
490
491#if 0//defined(ARCH_ARM_HAVE_NEON)
492 if((x1 + 3) < x2) {
493 uint32_t len = (x2 - x1 - 3) >> 1;
494 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
495 out += len << 1;
496 x1 += len << 1;
497 }
498#endif
499
500 while(x1 < x2) {
David Grossb0abb142015-03-12 15:23:03 -0700501 OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700502 out++;
503 x1++;
504 }
505}
506
David Grossb0abb142015-03-12 15:23:03 -0700507void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -0700508 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700509 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700510 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
Jason Sams34b0d312013-06-26 15:34:02 -0700511 if (!cp->alloc.get()) {
512 ALOGE("Convolve5x5 executed without input, skipping");
513 return;
514 }
515 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
516 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
517
David Grossb0abb142015-03-12 15:23:03 -0700518 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
519 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
520 uint32_t y2 = info->current.y;
521 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
522 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700523
524 const float4 *py0 = (const float4 *)(pin + stride * y0);
525 const float4 *py1 = (const float4 *)(pin + stride * y1);
526 const float4 *py2 = (const float4 *)(pin + stride * y2);
527 const float4 *py3 = (const float4 *)(pin + stride * y3);
528 const float4 *py4 = (const float4 *)(pin + stride * y4);
529
David Grossb0abb142015-03-12 15:23:03 -0700530 float4 *out = (float4 *)info->outPtr[0];
Jason Sams34b0d312013-06-26 15:34:02 -0700531 uint32_t x1 = xstart;
532 uint32_t x2 = xend;
533
534 while((x1 < x2) && (x1 < 2)) {
David Grossb0abb142015-03-12 15:23:03 -0700535 OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700536 out++;
537 x1++;
538 }
539
540#if 0//defined(ARCH_ARM_HAVE_NEON)
541 if((x1 + 3) < x2) {
542 uint32_t len = (x2 - x1 - 3) >> 1;
543 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
544 out += len << 1;
545 x1 += len << 1;
546 }
547#endif
548
549 while(x1 < x2) {
David Grossb0abb142015-03-12 15:23:03 -0700550 OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700551 out++;
552 x1++;
553 }
554}
555
David Grossb0abb142015-03-12 15:23:03 -0700556void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -0700557 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700558 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700559 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
Jason Sams34b0d312013-06-26 15:34:02 -0700560 if (!cp->alloc.get()) {
561 ALOGE("Convolve5x5 executed without input, skipping");
562 return;
563 }
564 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
565 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
566
David Grossb0abb142015-03-12 15:23:03 -0700567 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
568 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
569 uint32_t y2 = info->current.y;
570 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
571 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700572
573 const float2 *py0 = (const float2 *)(pin + stride * y0);
574 const float2 *py1 = (const float2 *)(pin + stride * y1);
575 const float2 *py2 = (const float2 *)(pin + stride * y2);
576 const float2 *py3 = (const float2 *)(pin + stride * y3);
577 const float2 *py4 = (const float2 *)(pin + stride * y4);
578
David Grossb0abb142015-03-12 15:23:03 -0700579 float2 *out = (float2 *)info->outPtr[0];
Jason Sams34b0d312013-06-26 15:34:02 -0700580 uint32_t x1 = xstart;
581 uint32_t x2 = xend;
582
583 while((x1 < x2) && (x1 < 2)) {
David Grossb0abb142015-03-12 15:23:03 -0700584 OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700585 out++;
586 x1++;
587 }
588
589#if 0//defined(ARCH_ARM_HAVE_NEON)
590 if((x1 + 3) < x2) {
591 uint32_t len = (x2 - x1 - 3) >> 1;
592 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
593 out += len << 1;
594 x1 += len << 1;
595 }
596#endif
597
598 while(x1 < x2) {
David Grossb0abb142015-03-12 15:23:03 -0700599 OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700600 out++;
601 x1++;
602 }
603}
604
David Grossb0abb142015-03-12 15:23:03 -0700605void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
Jason Sams34b0d312013-06-26 15:34:02 -0700606 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700607 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700608 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
Jason Sams34b0d312013-06-26 15:34:02 -0700609 if (!cp->alloc.get()) {
610 ALOGE("Convolve5x5 executed without input, skipping");
611 return;
612 }
613 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
614 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
615
David Grossb0abb142015-03-12 15:23:03 -0700616 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
617 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
618 uint32_t y2 = info->current.y;
619 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
620 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
Jason Sams34b0d312013-06-26 15:34:02 -0700621
622 const float *py0 = (const float *)(pin + stride * y0);
623 const float *py1 = (const float *)(pin + stride * y1);
624 const float *py2 = (const float *)(pin + stride * y2);
625 const float *py3 = (const float *)(pin + stride * y3);
626 const float *py4 = (const float *)(pin + stride * y4);
627
David Grossb0abb142015-03-12 15:23:03 -0700628 float *out = (float *)info->outPtr[0];
Jason Sams34b0d312013-06-26 15:34:02 -0700629 uint32_t x1 = xstart;
630 uint32_t x2 = xend;
631
632 while((x1 < x2) && (x1 < 2)) {
David Grossb0abb142015-03-12 15:23:03 -0700633 OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700634 out++;
635 x1++;
636 }
637
638#if 0//defined(ARCH_ARM_HAVE_NEON)
639 if((x1 + 3) < x2) {
640 uint32_t len = (x2 - x1 - 3) >> 1;
641 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
642 out += len << 1;
643 x1 += len << 1;
644 }
645#endif
646
647 while(x1 < x2) {
David Grossb0abb142015-03-12 15:23:03 -0700648 OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams34b0d312013-06-26 15:34:02 -0700649 out++;
650 x1++;
651 }
652}
Jason Sams709a0972012-11-15 18:18:04 -0800653
654RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
Jason Samsc905efd2012-11-26 15:20:18 -0800655 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
656 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
Jason Sams709a0972012-11-15 18:18:04 -0800657
Jason Sams34b0d312013-06-26 15:34:02 -0700658 if (e->getType() == RS_TYPE_FLOAT_32) {
659 switch(e->getVectorSize()) {
660 case 1:
661 mRootPtr = &kernelF1;
662 break;
663 case 2:
664 mRootPtr = &kernelF2;
665 break;
666 case 3:
667 case 4:
668 mRootPtr = &kernelF4;
669 break;
670 }
671 } else {
672 switch(e->getVectorSize()) {
673 case 1:
674 mRootPtr = &kernelU1;
675 break;
676 case 2:
677 mRootPtr = &kernelU2;
678 break;
679 case 3:
680 case 4:
681 mRootPtr = &kernelU4;
682 break;
683 }
684 }
Jason Samsce0351d2013-01-25 19:44:04 -0800685 for(int ct=0; ct < 25; ct++) {
Jason Sams34b0d312013-06-26 15:34:02 -0700686 mFp[ct] = 1.f / 25.f;
687 mIp[ct] = (short)(mFp[ct] * 256.f);
Jason Sams709a0972012-11-15 18:18:04 -0800688 }
689}
690
691RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
692}
693
694void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
695 s->mHal.info.exportedVariableCount = 2;
696}
697
698void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
699 alloc.clear();
700}
701
702
Jason Samsc905efd2012-11-26 15:20:18 -0800703RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
704 const Script *s, const Element *e) {
Jason Sams709a0972012-11-15 18:18:04 -0800705
Jason Samsc905efd2012-11-26 15:20:18 -0800706 return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800707}