blob: 11dda592155536426b5a2b43e4b149246d520023 [file] [log] [blame]
Jason Sams709a0972012-11-15 18:18:04 -08001/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29public:
30 virtual void populateScript(Script *);
31 virtual void invokeFreeChildren();
32
33 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34 virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35
36 virtual ~RsdCpuScriptIntrinsicConvolve5x5();
Jason Samsc905efd2012-11-26 15:20:18 -080037 RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
Jason Sams709a0972012-11-15 18:18:04 -080038
39protected:
Jason Sams34b0d312013-06-26 15:34:02 -070040 float mFp[28];
41 short mIp[28];
Jason Sams709a0972012-11-15 18:18:04 -080042 ObjectBaseRef<Allocation> alloc;
43
44
Jason Sams34b0d312013-06-26 15:34:02 -070045 static void kernelU1(const RsForEachStubParamStruct *p,
46 uint32_t xstart, uint32_t xend,
47 uint32_t instep, uint32_t outstep);
48 static void kernelU2(const RsForEachStubParamStruct *p,
49 uint32_t xstart, uint32_t xend,
50 uint32_t instep, uint32_t outstep);
51 static void kernelU4(const RsForEachStubParamStruct *p,
52 uint32_t xstart, uint32_t xend,
53 uint32_t instep, uint32_t outstep);
54 static void kernelF1(const RsForEachStubParamStruct *p,
55 uint32_t xstart, uint32_t xend,
56 uint32_t instep, uint32_t outstep);
57 static void kernelF2(const RsForEachStubParamStruct *p,
58 uint32_t xstart, uint32_t xend,
59 uint32_t instep, uint32_t outstep);
60 static void kernelF4(const RsForEachStubParamStruct *p,
61 uint32_t xstart, uint32_t xend,
62 uint32_t instep, uint32_t outstep);
Jason Sams709a0972012-11-15 18:18:04 -080063
64
65};
66
67}
68}
69
70void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
71 rsAssert(slot == 1);
72 alloc.set(static_cast<Allocation *>(data));
73}
74
75void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
76 const void *data, size_t dataLength) {
77 rsAssert(slot == 0);
Jason Sams34b0d312013-06-26 15:34:02 -070078 memcpy (&mFp, data, dataLength);
Jason Sams709a0972012-11-15 18:18:04 -080079 for(int ct=0; ct < 25; ct++) {
Jason Sams34b0d312013-06-26 15:34:02 -070080 if (mFp[ct] >= 0) {
81 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
82 } else {
83 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
84 }
Jason Sams709a0972012-11-15 18:18:04 -080085 }
86}
87
88
Jason Sams34b0d312013-06-26 15:34:02 -070089static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
90 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
91 const float* coeff) {
Jason Sams709a0972012-11-15 18:18:04 -080092
93 uint32_t x0 = rsMax((int32_t)x-2, 0);
94 uint32_t x1 = rsMax((int32_t)x-1, 0);
95 uint32_t x2 = x;
96 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
97 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
98
99 float4 px = convert_float4(py0[x0]) * coeff[0] +
100 convert_float4(py0[x1]) * coeff[1] +
101 convert_float4(py0[x2]) * coeff[2] +
102 convert_float4(py0[x3]) * coeff[3] +
103 convert_float4(py0[x4]) * coeff[4] +
104
105 convert_float4(py1[x0]) * coeff[5] +
106 convert_float4(py1[x1]) * coeff[6] +
107 convert_float4(py1[x2]) * coeff[7] +
108 convert_float4(py1[x3]) * coeff[8] +
109 convert_float4(py1[x4]) * coeff[9] +
110
111 convert_float4(py2[x0]) * coeff[10] +
112 convert_float4(py2[x1]) * coeff[11] +
113 convert_float4(py2[x2]) * coeff[12] +
114 convert_float4(py2[x3]) * coeff[13] +
115 convert_float4(py2[x4]) * coeff[14] +
116
117 convert_float4(py3[x0]) * coeff[15] +
118 convert_float4(py3[x1]) * coeff[16] +
119 convert_float4(py3[x2]) * coeff[17] +
120 convert_float4(py3[x3]) * coeff[18] +
121 convert_float4(py3[x4]) * coeff[19] +
122
123 convert_float4(py4[x0]) * coeff[20] +
124 convert_float4(py4[x1]) * coeff[21] +
125 convert_float4(py4[x2]) * coeff[22] +
126 convert_float4(py4[x3]) * coeff[23] +
127 convert_float4(py4[x4]) * coeff[24];
Jason Sams709a0972012-11-15 18:18:04 -0800128 px = clamp(px, 0.f, 255.f);
Jason Sams34b0d312013-06-26 15:34:02 -0700129 *out = convert_uchar4(px);
Jason Sams709a0972012-11-15 18:18:04 -0800130}
131
Jason Sams34b0d312013-06-26 15:34:02 -0700132static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
133 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
134 const float* coeff) {
135
136 uint32_t x0 = rsMax((int32_t)x-2, 0);
137 uint32_t x1 = rsMax((int32_t)x-1, 0);
138 uint32_t x2 = x;
139 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
140 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
141
142 float2 px = convert_float2(py0[x0]) * coeff[0] +
143 convert_float2(py0[x1]) * coeff[1] +
144 convert_float2(py0[x2]) * coeff[2] +
145 convert_float2(py0[x3]) * coeff[3] +
146 convert_float2(py0[x4]) * coeff[4] +
147
148 convert_float2(py1[x0]) * coeff[5] +
149 convert_float2(py1[x1]) * coeff[6] +
150 convert_float2(py1[x2]) * coeff[7] +
151 convert_float2(py1[x3]) * coeff[8] +
152 convert_float2(py1[x4]) * coeff[9] +
153
154 convert_float2(py2[x0]) * coeff[10] +
155 convert_float2(py2[x1]) * coeff[11] +
156 convert_float2(py2[x2]) * coeff[12] +
157 convert_float2(py2[x3]) * coeff[13] +
158 convert_float2(py2[x4]) * coeff[14] +
159
160 convert_float2(py3[x0]) * coeff[15] +
161 convert_float2(py3[x1]) * coeff[16] +
162 convert_float2(py3[x2]) * coeff[17] +
163 convert_float2(py3[x3]) * coeff[18] +
164 convert_float2(py3[x4]) * coeff[19] +
165
166 convert_float2(py4[x0]) * coeff[20] +
167 convert_float2(py4[x1]) * coeff[21] +
168 convert_float2(py4[x2]) * coeff[22] +
169 convert_float2(py4[x3]) * coeff[23] +
170 convert_float2(py4[x4]) * coeff[24];
171 px = clamp(px, 0.f, 255.f);
172 *out = convert_uchar2(px);
173}
174
175static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
176 const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
177 const float* coeff) {
178
179 uint32_t x0 = rsMax((int32_t)x-2, 0);
180 uint32_t x1 = rsMax((int32_t)x-1, 0);
181 uint32_t x2 = x;
182 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
183 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
184
185 float px = (float)(py0[x0]) * coeff[0] +
186 (float)(py0[x1]) * coeff[1] +
187 (float)(py0[x2]) * coeff[2] +
188 (float)(py0[x3]) * coeff[3] +
189 (float)(py0[x4]) * coeff[4] +
190
191 (float)(py1[x0]) * coeff[5] +
192 (float)(py1[x1]) * coeff[6] +
193 (float)(py1[x2]) * coeff[7] +
194 (float)(py1[x3]) * coeff[8] +
195 (float)(py1[x4]) * coeff[9] +
196
197 (float)(py2[x0]) * coeff[10] +
198 (float)(py2[x1]) * coeff[11] +
199 (float)(py2[x2]) * coeff[12] +
200 (float)(py2[x3]) * coeff[13] +
201 (float)(py2[x4]) * coeff[14] +
202
203 (float)(py3[x0]) * coeff[15] +
204 (float)(py3[x1]) * coeff[16] +
205 (float)(py3[x2]) * coeff[17] +
206 (float)(py3[x3]) * coeff[18] +
207 (float)(py3[x4]) * coeff[19] +
208
209 (float)(py4[x0]) * coeff[20] +
210 (float)(py4[x1]) * coeff[21] +
211 (float)(py4[x2]) * coeff[22] +
212 (float)(py4[x3]) * coeff[23] +
213 (float)(py4[x4]) * coeff[24];
214 px = clamp(px, 0.f, 255.f);
215 *out = px;
216}
217
218static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
219 const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
220 const float* coeff) {
221
222 uint32_t x0 = rsMax((int32_t)x-2, 0);
223 uint32_t x1 = rsMax((int32_t)x-1, 0);
224 uint32_t x2 = x;
225 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
226 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
227
228 float4 px = py0[x0] * coeff[0] +
229 py0[x1] * coeff[1] +
230 py0[x2] * coeff[2] +
231 py0[x3] * coeff[3] +
232 py0[x4] * coeff[4] +
233
234 py1[x0] * coeff[5] +
235 py1[x1] * coeff[6] +
236 py1[x2] * coeff[7] +
237 py1[x3] * coeff[8] +
238 py1[x4] * coeff[9] +
239
240 py2[x0] * coeff[10] +
241 py2[x1] * coeff[11] +
242 py2[x2] * coeff[12] +
243 py2[x3] * coeff[13] +
244 py2[x4] * coeff[14] +
245
246 py3[x0] * coeff[15] +
247 py3[x1] * coeff[16] +
248 py3[x2] * coeff[17] +
249 py3[x3] * coeff[18] +
250 py3[x4] * coeff[19] +
251
252 py4[x0] * coeff[20] +
253 py4[x1] * coeff[21] +
254 py4[x2] * coeff[22] +
255 py4[x3] * coeff[23] +
256 py4[x4] * coeff[24];
257 *out = px;
258}
259
260static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
261 const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
262 const float* coeff) {
263
264 uint32_t x0 = rsMax((int32_t)x-2, 0);
265 uint32_t x1 = rsMax((int32_t)x-1, 0);
266 uint32_t x2 = x;
267 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
268 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
269
270 float2 px = py0[x0] * coeff[0] +
271 py0[x1] * coeff[1] +
272 py0[x2] * coeff[2] +
273 py0[x3] * coeff[3] +
274 py0[x4] * coeff[4] +
275
276 py1[x0] * coeff[5] +
277 py1[x1] * coeff[6] +
278 py1[x2] * coeff[7] +
279 py1[x3] * coeff[8] +
280 py1[x4] * coeff[9] +
281
282 py2[x0] * coeff[10] +
283 py2[x1] * coeff[11] +
284 py2[x2] * coeff[12] +
285 py2[x3] * coeff[13] +
286 py2[x4] * coeff[14] +
287
288 py3[x0] * coeff[15] +
289 py3[x1] * coeff[16] +
290 py3[x2] * coeff[17] +
291 py3[x3] * coeff[18] +
292 py3[x4] * coeff[19] +
293
294 py4[x0] * coeff[20] +
295 py4[x1] * coeff[21] +
296 py4[x2] * coeff[22] +
297 py4[x3] * coeff[23] +
298 py4[x4] * coeff[24];
299 *out = px;
300}
301
302static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
303 const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
304 const float* coeff) {
305
306 uint32_t x0 = rsMax((int32_t)x-2, 0);
307 uint32_t x1 = rsMax((int32_t)x-1, 0);
308 uint32_t x2 = x;
309 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
310 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
311
312 float px = py0[x0] * coeff[0] +
313 py0[x1] * coeff[1] +
314 py0[x2] * coeff[2] +
315 py0[x3] * coeff[3] +
316 py0[x4] * coeff[4] +
317
318 py1[x0] * coeff[5] +
319 py1[x1] * coeff[6] +
320 py1[x2] * coeff[7] +
321 py1[x3] * coeff[8] +
322 py1[x4] * coeff[9] +
323
324 py2[x0] * coeff[10] +
325 py2[x1] * coeff[11] +
326 py2[x2] * coeff[12] +
327 py2[x3] * coeff[13] +
328 py2[x4] * coeff[14] +
329
330 py3[x0] * coeff[15] +
331 py3[x1] * coeff[16] +
332 py3[x2] * coeff[17] +
333 py3[x3] * coeff[18] +
334 py3[x4] * coeff[19] +
335
336 py4[x0] * coeff[20] +
337 py4[x1] * coeff[21] +
338 py4[x2] * coeff[22] +
339 py4[x3] * coeff[23] +
340 py4[x4] * coeff[24];
341 *out = px;
342}
343
344
// Fixed-point row routine with C linkage, defined outside this file.
// Within this file it is called only from kernelU4 (under
// ARCH_ARM_HAVE_VFP), with the short coefficient table (floats scaled by
// 256) and a count after which the caller advances by count * 2 pixels.
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0, const void *y1,
                                          const void *y2, const void *y3,
                                          const void *y4,
                                          const short *coef, uint32_t count);
348
Jason Sams34b0d312013-06-26 15:34:02 -0700349void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
350 uint32_t xstart, uint32_t xend,
351 uint32_t instep, uint32_t outstep) {
Jason Sams709a0972012-11-15 18:18:04 -0800352 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
353 if (!cp->alloc.get()) {
354 ALOGE("Convolve5x5 executed without input, skipping");
355 return;
356 }
357 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
358 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
359
360 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
361 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
362 uint32_t y2 = p->y;
363 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
364 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
365
366 const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
367 const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
368 const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
369 const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
370 const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
371
372 uchar4 *out = (uchar4 *)p->out;
373 uint32_t x1 = xstart;
374 uint32_t x2 = xend;
375
376 while((x1 < x2) && (x1 < 2)) {
Jason Sams34b0d312013-06-26 15:34:02 -0700377 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams709a0972012-11-15 18:18:04 -0800378 out++;
379 x1++;
380 }
381
Jason Samsf5ef8df2013-08-06 13:49:25 -0700382#if defined(ARCH_ARM_HAVE_VFP)
383 if(gArchUseSIMD && ((x1 + 3) < x2)) {
Jason Sams709a0972012-11-15 18:18:04 -0800384 uint32_t len = (x2 - x1 - 3) >> 1;
Jason Sams34b0d312013-06-26 15:34:02 -0700385 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->mIp, len);
386 out += len << 1;
387 x1 += len << 1;
388 }
389#endif
390
391 while(x1 < x2) {
392 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
393 out++;
394 x1++;
395 }
396}
397
398void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
399 uint32_t xstart, uint32_t xend,
400 uint32_t instep, uint32_t outstep) {
401 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
402 if (!cp->alloc.get()) {
403 ALOGE("Convolve5x5 executed without input, skipping");
404 return;
405 }
406 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
407 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
408
409 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
410 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
411 uint32_t y2 = p->y;
412 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
413 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
414
415 const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
416 const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
417 const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
418 const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
419 const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
420
421 uchar2 *out = (uchar2 *)p->out;
422 uint32_t x1 = xstart;
423 uint32_t x2 = xend;
424
425 while((x1 < x2) && (x1 < 2)) {
426 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
427 out++;
428 x1++;
429 }
430
431#if 0//defined(ARCH_ARM_HAVE_NEON)
432 if((x1 + 3) < x2) {
433 uint32_t len = (x2 - x1 - 3) >> 1;
Jason Sams709a0972012-11-15 18:18:04 -0800434 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
435 out += len << 1;
436 x1 += len << 1;
437 }
438#endif
439
440 while(x1 < x2) {
Jason Sams34b0d312013-06-26 15:34:02 -0700441 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
Jason Sams709a0972012-11-15 18:18:04 -0800442 out++;
443 x1++;
444 }
445}
446
Jason Sams34b0d312013-06-26 15:34:02 -0700447void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
448 uint32_t xstart, uint32_t xend,
449 uint32_t instep, uint32_t outstep) {
450 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
451 if (!cp->alloc.get()) {
452 ALOGE("Convolve5x5 executed without input, skipping");
453 return;
454 }
455 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
456 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
457
458 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
459 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
460 uint32_t y2 = p->y;
461 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
462 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
463
464 const uchar *py0 = (const uchar *)(pin + stride * y0);
465 const uchar *py1 = (const uchar *)(pin + stride * y1);
466 const uchar *py2 = (const uchar *)(pin + stride * y2);
467 const uchar *py3 = (const uchar *)(pin + stride * y3);
468 const uchar *py4 = (const uchar *)(pin + stride * y4);
469
470 uchar *out = (uchar *)p->out;
471 uint32_t x1 = xstart;
472 uint32_t x2 = xend;
473
474 while((x1 < x2) && (x1 < 2)) {
475 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
476 out++;
477 x1++;
478 }
479
480#if 0//defined(ARCH_ARM_HAVE_NEON)
481 if((x1 + 3) < x2) {
482 uint32_t len = (x2 - x1 - 3) >> 1;
483 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
484 out += len << 1;
485 x1 += len << 1;
486 }
487#endif
488
489 while(x1 < x2) {
490 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
491 out++;
492 x1++;
493 }
494}
495
496void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
497 uint32_t xstart, uint32_t xend,
498 uint32_t instep, uint32_t outstep) {
499 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
500 if (!cp->alloc.get()) {
501 ALOGE("Convolve5x5 executed without input, skipping");
502 return;
503 }
504 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
505 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
506
507 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
508 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
509 uint32_t y2 = p->y;
510 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
511 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
512
513 const float4 *py0 = (const float4 *)(pin + stride * y0);
514 const float4 *py1 = (const float4 *)(pin + stride * y1);
515 const float4 *py2 = (const float4 *)(pin + stride * y2);
516 const float4 *py3 = (const float4 *)(pin + stride * y3);
517 const float4 *py4 = (const float4 *)(pin + stride * y4);
518
519 float4 *out = (float4 *)p->out;
520 uint32_t x1 = xstart;
521 uint32_t x2 = xend;
522
523 while((x1 < x2) && (x1 < 2)) {
524 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
525 out++;
526 x1++;
527 }
528
529#if 0//defined(ARCH_ARM_HAVE_NEON)
530 if((x1 + 3) < x2) {
531 uint32_t len = (x2 - x1 - 3) >> 1;
532 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
533 out += len << 1;
534 x1 += len << 1;
535 }
536#endif
537
538 while(x1 < x2) {
539 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
540 out++;
541 x1++;
542 }
543}
544
545void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
546 uint32_t xstart, uint32_t xend,
547 uint32_t instep, uint32_t outstep) {
548 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
549 if (!cp->alloc.get()) {
550 ALOGE("Convolve5x5 executed without input, skipping");
551 return;
552 }
553 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
554 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
555
556 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
557 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
558 uint32_t y2 = p->y;
559 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
560 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
561
562 const float2 *py0 = (const float2 *)(pin + stride * y0);
563 const float2 *py1 = (const float2 *)(pin + stride * y1);
564 const float2 *py2 = (const float2 *)(pin + stride * y2);
565 const float2 *py3 = (const float2 *)(pin + stride * y3);
566 const float2 *py4 = (const float2 *)(pin + stride * y4);
567
568 float2 *out = (float2 *)p->out;
569 uint32_t x1 = xstart;
570 uint32_t x2 = xend;
571
572 while((x1 < x2) && (x1 < 2)) {
573 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
574 out++;
575 x1++;
576 }
577
578#if 0//defined(ARCH_ARM_HAVE_NEON)
579 if((x1 + 3) < x2) {
580 uint32_t len = (x2 - x1 - 3) >> 1;
581 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
582 out += len << 1;
583 x1 += len << 1;
584 }
585#endif
586
587 while(x1 < x2) {
588 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
589 out++;
590 x1++;
591 }
592}
593
594void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
595 uint32_t xstart, uint32_t xend,
596 uint32_t instep, uint32_t outstep) {
597 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
598 if (!cp->alloc.get()) {
599 ALOGE("Convolve5x5 executed without input, skipping");
600 return;
601 }
602 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
603 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
604
605 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
606 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
607 uint32_t y2 = p->y;
608 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
609 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
610
611 const float *py0 = (const float *)(pin + stride * y0);
612 const float *py1 = (const float *)(pin + stride * y1);
613 const float *py2 = (const float *)(pin + stride * y2);
614 const float *py3 = (const float *)(pin + stride * y3);
615 const float *py4 = (const float *)(pin + stride * y4);
616
617 float *out = (float *)p->out;
618 uint32_t x1 = xstart;
619 uint32_t x2 = xend;
620
621 while((x1 < x2) && (x1 < 2)) {
622 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
623 out++;
624 x1++;
625 }
626
627#if 0//defined(ARCH_ARM_HAVE_NEON)
628 if((x1 + 3) < x2) {
629 uint32_t len = (x2 - x1 - 3) >> 1;
630 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
631 out += len << 1;
632 x1 += len << 1;
633 }
634#endif
635
636 while(x1 < x2) {
637 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
638 out++;
639 x1++;
640 }
641}
Jason Sams709a0972012-11-15 18:18:04 -0800642
643RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
Jason Samsc905efd2012-11-26 15:20:18 -0800644 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
645 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
Jason Sams709a0972012-11-15 18:18:04 -0800646
Jason Sams34b0d312013-06-26 15:34:02 -0700647 if (e->getType() == RS_TYPE_FLOAT_32) {
648 switch(e->getVectorSize()) {
649 case 1:
650 mRootPtr = &kernelF1;
651 break;
652 case 2:
653 mRootPtr = &kernelF2;
654 break;
655 case 3:
656 case 4:
657 mRootPtr = &kernelF4;
658 break;
659 }
660 } else {
661 switch(e->getVectorSize()) {
662 case 1:
663 mRootPtr = &kernelU1;
664 break;
665 case 2:
666 mRootPtr = &kernelU2;
667 break;
668 case 3:
669 case 4:
670 mRootPtr = &kernelU4;
671 break;
672 }
673 }
Jason Samsce0351d2013-01-25 19:44:04 -0800674 for(int ct=0; ct < 25; ct++) {
Jason Sams34b0d312013-06-26 15:34:02 -0700675 mFp[ct] = 1.f / 25.f;
676 mIp[ct] = (short)(mFp[ct] * 256.f);
Jason Sams709a0972012-11-15 18:18:04 -0800677 }
678}
679
680RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
681}
682
683void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
684 s->mHal.info.exportedVariableCount = 2;
685}
686
687void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
688 alloc.clear();
689}
690
691
Jason Samsc905efd2012-11-26 15:20:18 -0800692RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
693 const Script *s, const Element *e) {
Jason Sams709a0972012-11-15 18:18:04 -0800694
Jason Samsc905efd2012-11-26 15:20:18 -0800695 return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800696}
697
698
699