blob: 1a28aabccddb47cc265f5aaa3100dc4864857b9b [file] [log] [blame]
Jason Samsd85e2832012-09-11 16:04:27 -07001/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Jason Sams709a0972012-11-15 18:18:04 -080017#include "rsCpuIntrinsic.h"
18#include "rsCpuIntrinsicInlines.h"
Jason Samsd85e2832012-09-11 16:04:27 -070019
20using namespace android;
21using namespace android::renderscript;
22
Jason Sams709a0972012-11-15 18:18:04 -080023namespace android {
24namespace renderscript {
25
26
27class RsdCpuScriptIntrinsicBlur : public RsdCpuScriptIntrinsic {
28public:
29 virtual void populateScript(Script *);
30 virtual void invokeFreeChildren();
31
32 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
33 virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
34
35 virtual ~RsdCpuScriptIntrinsicBlur();
Jason Samsc905efd2012-11-26 15:20:18 -080036 RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
Jason Sams709a0972012-11-15 18:18:04 -080037
38protected:
Jason Samsc44d6702012-11-28 18:37:52 -080039 float mFp[104];
40 short mIp[104];
41 void **mScratch;
42 size_t *mScratchSize;
43 float mRadius;
44 int mIradius;
45 ObjectBaseRef<Allocation> mAlloc;
Jason Sams709a0972012-11-15 18:18:04 -080046
Jason Samsc905efd2012-11-26 15:20:18 -080047 static void kernelU4(const RsForEachStubParamStruct *p,
48 uint32_t xstart, uint32_t xend,
49 uint32_t instep, uint32_t outstep);
50 static void kernelU1(const RsForEachStubParamStruct *p,
51 uint32_t xstart, uint32_t xend,
52 uint32_t instep, uint32_t outstep);
Jason Sams709a0972012-11-15 18:18:04 -080053 void ComputeGaussianWeights();
Jason Samsd85e2832012-09-11 16:04:27 -070054};
55
Jason Sams709a0972012-11-15 18:18:04 -080056}
57}
58
59
60void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
Jason Samsc44d6702012-11-28 18:37:52 -080061 memset(mFp, 0, sizeof(mFp));
62 memset(mIp, 0, sizeof(mIp));
Jason Sams7079cd82012-11-27 18:26:33 -080063
Jason Samsd85e2832012-09-11 16:04:27 -070064 // Compute gaussian weights for the blur
65 // e is the euler's number
66 float e = 2.718281828459045f;
67 float pi = 3.1415926535897932f;
68 // g(x) = ( 1 / sqrt( 2 * pi ) * sigma) * e ^ ( -x^2 / 2 * sigma^2 )
69 // x is of the form [-radius .. 0 .. radius]
70 // and sigma varies with radius.
71 // Based on some experimental radius values and sigma's
72 // we approximately fit sigma = f(radius) as
73 // sigma = radius * 0.4 + 0.6
74 // The larger the radius gets, the more our gaussian blur
75 // will resemble a box blur since with large sigma
76 // the gaussian curve begins to lose its shape
Jason Samsc44d6702012-11-28 18:37:52 -080077 float sigma = 0.4f * mRadius + 0.6f;
Jason Samsd85e2832012-09-11 16:04:27 -070078
79 // Now compute the coefficients. We will store some redundant values to save
80 // some math during the blur calculations precompute some values
81 float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
82 float coeff2 = - 1.0f / (2.0f * sigma * sigma);
83
84 float normalizeFactor = 0.0f;
85 float floatR = 0.0f;
86 int r;
Jason Samsc44d6702012-11-28 18:37:52 -080087 mIradius = (float)ceil(mRadius) + 0.5f;
88 for (r = -mIradius; r <= mIradius; r ++) {
Jason Samsd85e2832012-09-11 16:04:27 -070089 floatR = (float)r;
Jason Samsc44d6702012-11-28 18:37:52 -080090 mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
91 normalizeFactor += mFp[r + mIradius];
Jason Samsd85e2832012-09-11 16:04:27 -070092 }
93
94 //Now we need to normalize the weights because all our coefficients need to add up to one
95 normalizeFactor = 1.0f / normalizeFactor;
Jason Samsc44d6702012-11-28 18:37:52 -080096 for (r = -mIradius; r <= mIradius; r ++) {
97 mFp[r + mIradius] *= normalizeFactor;
98 mIp[r + mIradius] = (short)(mIp[r + mIradius] * 32768);
Jason Samsd85e2832012-09-11 16:04:27 -070099 }
100}
101
Jason Sams709a0972012-11-15 18:18:04 -0800102void RsdCpuScriptIntrinsicBlur::setGlobalObj(uint32_t slot, ObjectBase *data) {
Jason Samsd85e2832012-09-11 16:04:27 -0700103 rsAssert(slot == 1);
Jason Samsc44d6702012-11-28 18:37:52 -0800104 mAlloc.set(static_cast<Allocation *>(data));
Jason Samsd85e2832012-09-11 16:04:27 -0700105}
106
Jason Sams709a0972012-11-15 18:18:04 -0800107void RsdCpuScriptIntrinsicBlur::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
Jason Samsd85e2832012-09-11 16:04:27 -0700108 rsAssert(slot == 0);
Jason Samsc44d6702012-11-28 18:37:52 -0800109 mRadius = ((const float *)data)[0];
Jason Sams709a0972012-11-15 18:18:04 -0800110 ComputeGaussianWeights();
Jason Samsd85e2832012-09-11 16:04:27 -0700111}
112
Jason Samsd85e2832012-09-11 16:04:27 -0700113
114
Jason Samsc905efd2012-11-26 15:20:18 -0800115static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
116 const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
Jason Samsd85e2832012-09-11 16:04:27 -0700117
118 const uchar *pi = ptrIn + x*4;
119
120 float4 blurredPixel = 0;
121 for (int r = -iradius; r <= iradius; r ++) {
122 int validY = rsMax((y + r), 0);
123 validY = rsMin(validY, (int)(p->dimY - 1));
124 const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
125 float4 pf = convert_float4(pvy[0]);
126 blurredPixel += pf * gPtr[0];
127 gPtr++;
128 }
129
130 out->xyzw = blurredPixel;
131}
132
Jason Samsc905efd2012-11-26 15:20:18 -0800133static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
134 const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
Jason Samse78e5142012-09-19 00:46:31 -0700135
Jason Samsc905efd2012-11-26 15:20:18 -0800136 const uchar *pi = ptrIn + x;
137
138 float blurredPixel = 0;
139 for (int r = -iradius; r <= iradius; r ++) {
140 int validY = rsMax((y + r), 0);
141 validY = rsMin(validY, (int)(p->dimY - 1));
142 float pf = (float)pi[validY * iStride];
143 blurredPixel += pf * gPtr[0];
144 gPtr++;
145 }
146
147 out[0] = blurredPixel;
148}
149
// Blur kernels with C linkage that are defined outside this file. Every
// call site in this file is guarded by ARCH_ARM_HAVE_NEON.
// NOTE(review): call sites pass a start index and an end index as the last
// two arguments, although the final parameter is named "ct" here - confirm
// against the kernel implementation.
extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride,
                                       const void *gptr, int rct, int x1, int ct);
extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin,
                                       const void *gptr, int rct, int x1, int ct);
extern "C" void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin,
                                       const void *gptr, int rct, int x1, int ct);
Jason Samsc905efd2012-11-26 15:20:18 -0800153
154static void OneVFU4(float4 *out,
155 const uchar *ptrIn, int iStride, const float* gPtr, int ct,
156 int x1, int x2) {
Jason Samse78e5142012-09-19 00:46:31 -0700157
158#if defined(ARCH_ARM_HAVE_NEON)
Jason Sams2207ab72012-09-19 13:44:55 -0700159 {
160 int t = (x2 - x1);
161 t &= ~1;
162 if(t) {
Jason Samsc905efd2012-11-26 15:20:18 -0800163 rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
Jason Sams2207ab72012-09-19 13:44:55 -0700164 }
165 x1 += t;
166 }
Jason Samse78e5142012-09-19 00:46:31 -0700167#endif
168
169 while(x2 > x1) {
Tim Murray2e5ef662012-10-22 14:46:39 -0700170 const uchar *pi = ptrIn;
Jason Samse78e5142012-09-19 00:46:31 -0700171 float4 blurredPixel = 0;
172 const float* gp = gPtr;
173
174 for (int r = 0; r < ct; r++) {
175 float4 pf = convert_float4(((const uchar4 *)pi)[0]);
176 blurredPixel += pf * gp[0];
177 pi += iStride;
178 gp++;
179 }
180 out->xyzw = blurredPixel;
181 x1++;
182 out++;
Jason Sams7079cd82012-11-27 18:26:33 -0800183 ptrIn++;
Jason Samse78e5142012-09-19 00:46:31 -0700184 }
185}
186
Jason Samsc905efd2012-11-26 15:20:18 -0800187static void OneVFU1(float *out,
Jason Samsc44d6702012-11-28 18:37:52 -0800188 const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) {
Jason Samsc905efd2012-11-26 15:20:18 -0800189
Jason Samsc44d6702012-11-28 18:37:52 -0800190 int len = x2 - x1;
191
192 while((x2 > x1) && (((int)ptrIn) & 0x3)) {
Jason Sams7079cd82012-11-27 18:26:33 -0800193 const uchar *pi = ptrIn;
194 float blurredPixel = 0;
195 const float* gp = gPtr;
196
197 for (int r = 0; r < ct; r++) {
198 float pf = (float)pi[0];
199 blurredPixel += pf * gp[0];
200 pi += iStride;
201 gp++;
202 }
203 out[0] = blurredPixel;
Jason Samsc44d6702012-11-28 18:37:52 -0800204 x1++;
Jason Sams7079cd82012-11-27 18:26:33 -0800205 out++;
206 ptrIn++;
207 }
208
Jason Samsc905efd2012-11-26 15:20:18 -0800209#if defined(ARCH_ARM_HAVE_NEON)
210 {
Jason Samsc44d6702012-11-28 18:37:52 -0800211 int t = (x2 - x1) >> 2;
Jason Samsc905efd2012-11-26 15:20:18 -0800212 t &= ~1;
213 if(t) {
Jason Samsce0351d2013-01-25 19:44:04 -0800214 rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
Jason Sams7079cd82012-11-27 18:26:33 -0800215 len -= t << 2;
216 ptrIn += t << 2;
217 out += t << 2;
Jason Samsc905efd2012-11-26 15:20:18 -0800218 }
Jason Samsc905efd2012-11-26 15:20:18 -0800219 }
220#endif
221
222 while(len) {
223 const uchar *pi = ptrIn;
224 float blurredPixel = 0;
225 const float* gp = gPtr;
226
227 for (int r = 0; r < ct; r++) {
228 float pf = (float)pi[0];
229 blurredPixel += pf * gp[0];
230 pi += iStride;
231 gp++;
232 }
233 out[0] = blurredPixel;
234 len--;
235 out++;
Jason Sams7079cd82012-11-27 18:26:33 -0800236 ptrIn++;
Jason Samsc905efd2012-11-26 15:20:18 -0800237 }
238}
239
240static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
241 const float4 *ptrIn, const float* gPtr, int iradius) {
Jason Samsd85e2832012-09-11 16:04:27 -0700242
243 float4 blurredPixel = 0;
244 for (int r = -iradius; r <= iradius; r ++) {
245 int validX = rsMax((x + r), 0);
246 validX = rsMin(validX, (int)(p->dimX - 1));
247 float4 pf = ptrIn[validX];
248 blurredPixel += pf * gPtr[0];
249 gPtr++;
250 }
251
252 out->xyzw = convert_uchar4(blurredPixel);
253}
254
Jason Samsc905efd2012-11-26 15:20:18 -0800255static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
256 const float *ptrIn, const float* gPtr, int iradius) {
Jason Samsd85e2832012-09-11 16:04:27 -0700257
Jason Samsc905efd2012-11-26 15:20:18 -0800258 float blurredPixel = 0;
259 for (int r = -iradius; r <= iradius; r ++) {
260 int validX = rsMax((x + r), 0);
261 validX = rsMin(validX, (int)(p->dimX - 1));
262 float pf = ptrIn[validX];
263 blurredPixel += pf * gPtr[0];
264 gPtr++;
265 }
266
267 out[0] = (uchar)blurredPixel;
268}
269
270
271void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
272 uint32_t xstart, uint32_t xend,
273 uint32_t instep, uint32_t outstep) {
Jason Samsc44d6702012-11-28 18:37:52 -0800274
Stephen Hines2913f382013-01-14 20:44:09 -0800275 float4 stackbuf[2048];
276 float4 *buf = &stackbuf[0];
Jason Sams709a0972012-11-15 18:18:04 -0800277 RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
Jason Samsc44d6702012-11-28 18:37:52 -0800278 if (!cp->mAlloc.get()) {
Jason Samsb801b942012-10-10 12:07:38 -0700279 ALOGE("Blur executed without input, skipping");
280 return;
281 }
Jason Samsc44d6702012-11-28 18:37:52 -0800282 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
283 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
Jason Samsd85e2832012-09-11 16:04:27 -0700284
285 uchar4 *out = (uchar4 *)p->out;
286 uint32_t x1 = xstart;
287 uint32_t x2 = xend;
288
Jason Samsc44d6702012-11-28 18:37:52 -0800289 if (p->dimX > 2048) {
290 if ((p->dimX > cp->mScratchSize[p->lid]) || !cp->mScratch[p->lid]) {
291 cp->mScratch[p->lid] = realloc(cp->mScratch[p->lid], p->dimX * 16);
292 cp->mScratchSize[p->lid] = p->dimX;
293 }
Stephen Hines2913f382013-01-14 20:44:09 -0800294 buf = (float4 *)cp->mScratch[p->lid];
Jason Samsc44d6702012-11-28 18:37:52 -0800295 }
Jason Samsd85e2832012-09-11 16:04:27 -0700296 float4 *fout = (float4 *)buf;
Jason Samse78e5142012-09-19 00:46:31 -0700297 int y = p->y;
Jason Samsc44d6702012-11-28 18:37:52 -0800298 if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius))) {
299 const uchar *pi = pin + (y - cp->mIradius) * stride;
300 OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
Jason Samse78e5142012-09-19 00:46:31 -0700301 } else {
302 while(x2 > x1) {
Jason Samsc44d6702012-11-28 18:37:52 -0800303 OneVU4(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
Jason Samse78e5142012-09-19 00:46:31 -0700304 fout++;
305 x1++;
306 }
Jason Samsd85e2832012-09-11 16:04:27 -0700307 }
308
309 x1 = xstart;
Jason Samsc44d6702012-11-28 18:37:52 -0800310 while ((x1 < (uint32_t)cp->mIradius) && (x1 < x2)) {
Stephen Hines2913f382013-01-14 20:44:09 -0800311 OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
Jason Samse78e5142012-09-19 00:46:31 -0700312 out++;
313 x1++;
314 }
315#if defined(ARCH_ARM_HAVE_NEON)
Jason Samsc44d6702012-11-28 18:37:52 -0800316 if ((x1 + cp->mIradius) < x2) {
Stephen Hines2913f382013-01-14 20:44:09 -0800317 rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
Jason Samsc44d6702012-11-28 18:37:52 -0800318 cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
319 out += (x2 - cp->mIradius) - x1;
320 x1 = x2 - cp->mIradius;
Jason Samse78e5142012-09-19 00:46:31 -0700321 }
322#endif
Jason Samsd85e2832012-09-11 16:04:27 -0700323 while(x2 > x1) {
Stephen Hines2913f382013-01-14 20:44:09 -0800324 OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
Jason Samsd85e2832012-09-11 16:04:27 -0700325 out++;
326 x1++;
327 }
Jason Samsd85e2832012-09-11 16:04:27 -0700328}
329
Jason Samsc905efd2012-11-26 15:20:18 -0800330void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
331 uint32_t xstart, uint32_t xend,
332 uint32_t instep, uint32_t outstep) {
333 float buf[4 * 2048];
334 RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
Jason Samsc44d6702012-11-28 18:37:52 -0800335 if (!cp->mAlloc.get()) {
Jason Samsc905efd2012-11-26 15:20:18 -0800336 ALOGE("Blur executed without input, skipping");
337 return;
338 }
Jason Samsc44d6702012-11-28 18:37:52 -0800339 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
340 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
Jason Samsd85e2832012-09-11 16:04:27 -0700341
Jason Samsc905efd2012-11-26 15:20:18 -0800342 uchar *out = (uchar *)p->out;
343 uint32_t x1 = xstart;
344 uint32_t x2 = xend;
345
346 float *fout = (float *)buf;
347 int y = p->y;
Jason Samsce0351d2013-01-25 19:44:04 -0800348 if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) {
Jason Samsc44d6702012-11-28 18:37:52 -0800349 const uchar *pi = pin + (y - cp->mIradius) * stride;
350 OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
Jason Samsc905efd2012-11-26 15:20:18 -0800351 } else {
352 while(x2 > x1) {
Jason Samsc44d6702012-11-28 18:37:52 -0800353 OneVU1(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
Jason Samsc905efd2012-11-26 15:20:18 -0800354 fout++;
355 x1++;
356 }
357 }
358
359 x1 = xstart;
Jason Sams7079cd82012-11-27 18:26:33 -0800360 while ((x1 < x2) &&
Jason Samsc44d6702012-11-28 18:37:52 -0800361 ((x1 < (uint32_t)cp->mIradius) || (((int)out) & 0x3))) {
362 OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
Jason Samsc905efd2012-11-26 15:20:18 -0800363 out++;
364 x1++;
365 }
Jason Sams7079cd82012-11-27 18:26:33 -0800366#if defined(ARCH_ARM_HAVE_NEON)
Jason Samsc44d6702012-11-28 18:37:52 -0800367 if ((x1 + cp->mIradius) < x2) {
368 uint32_t len = x2 - (x1 + cp->mIradius);
369 len &= ~3;
370 rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
371 cp->mIradius * 2 + 1, x1, x1 + len);
372 out += len;
373 x1 += len;
Jason Samsc905efd2012-11-26 15:20:18 -0800374 }
375#endif
376 while(x2 > x1) {
Jason Samsc44d6702012-11-28 18:37:52 -0800377 OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
Jason Samsc905efd2012-11-26 15:20:18 -0800378 out++;
379 x1++;
380 }
381}
382
383RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx,
384 const Script *s, const Element *e)
385 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
386
387 mRootPtr = NULL;
388 if (e->getType() == RS_TYPE_UNSIGNED_8) {
389 switch (e->getVectorSize()) {
390 case 1:
391 mRootPtr = &kernelU1;
392 break;
393 case 4:
394 mRootPtr = &kernelU4;
395 break;
396 }
397 }
398 rsAssert(mRootPtr);
Jason Samsc44d6702012-11-28 18:37:52 -0800399 mRadius = 5;
400
401 mScratch = new void *[mCtx->getThreadCount()];
402 mScratchSize = new size_t[mCtx->getThreadCount()];
403
Jason Sams709a0972012-11-15 18:18:04 -0800404 ComputeGaussianWeights();
405}
Jason Samsd85e2832012-09-11 16:04:27 -0700406
Jason Sams709a0972012-11-15 18:18:04 -0800407RsdCpuScriptIntrinsicBlur::~RsdCpuScriptIntrinsicBlur() {
Jason Samsc44d6702012-11-28 18:37:52 -0800408 uint32_t threads = mCtx->getThreadCount();
409 if (mScratch) {
410 for (size_t i = 0; i < threads; i++) {
411 if (mScratch[i]) {
412 free(mScratch[i]);
413 }
414 }
415 delete []mScratch;
416 }
417 if (mScratchSize) {
418 delete []mScratchSize;
419 }
Jason Sams709a0972012-11-15 18:18:04 -0800420}
421
422void RsdCpuScriptIntrinsicBlur::populateScript(Script *s) {
423 s->mHal.info.exportedVariableCount = 2;
424}
425
426void RsdCpuScriptIntrinsicBlur::invokeFreeChildren() {
Jason Samsc44d6702012-11-28 18:37:52 -0800427 mAlloc.clear();
Jason Sams709a0972012-11-15 18:18:04 -0800428}
429
430
Jason Samsc905efd2012-11-26 15:20:18 -0800431RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
Jason Sams709a0972012-11-15 18:18:04 -0800432
Jason Samsc905efd2012-11-26 15:20:18 -0800433 return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
Jason Samsd85e2832012-09-11 16:04:27 -0700434}
435
436