blob: f072e521d13c3303d62f73902d3b6bca9c7cfaa2 [file] [log] [blame]
Jason Sams709a0972012-11-15 18:18:04 -08001/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Jason Sams9b2b9ef2013-07-29 17:38:00 -070017#include <sys/mman.h>
18#include <unistd.h>
Jason Sams709a0972012-11-15 18:18:04 -080019
20#include "rsCpuIntrinsic.h"
21#include "rsCpuIntrinsicInlines.h"
Jason Sams9b2b9ef2013-07-29 17:38:00 -070022#include "linkloader/include/MemChunk.h"
Narayan Kamath72f5f8c2014-03-11 12:23:29 +000023#include "linkloader/utils/flush_cpu_cache.h"
Jason Sams9b2b9ef2013-07-29 17:38:00 -070024
25#include <sys/mman.h>
26#include <stddef.h>
27#include <stdint.h>
28#include <stdlib.h>
29//#include <utils/StopWatch.h>
30
Jason Sams709a0972012-11-15 18:18:04 -080031
Jason Samsa65de102013-08-09 13:42:28 -070032/* uint kernel
33 * Q0 D0: Load slot for R
34 * D1: Load slot for G
35 * Q1 D2: Load slot for B
36 * D3: Load slot for A
37 * Q2 D4: Matrix
38 * D5: =
39 * Q3 D6: =
40 * D7: =
41 * Q4 D8: Add R
42 * D9:
43 * Q5 D10: Add G
44 * D11:
45 * Q6 D12: Add B
46 * D13:
47 * Q7 D14: Add A
48 * D15:
49 * Q8 D16: I32: R Sum
50 * D17:
51 * Q9 D18: I32: G Sum
52 * D19:
53 * Q10 D20: I32: B Sum
54 * D21:
55 * Q11 D22: I32: A Sum
56 * D23:
57 * Q12 D24: U16: expanded R
58 * D25:
59 * Q13 D26: U16: expanded G
60 * D27:
61 * Q14 D28: U16: expanded B
62 * D29:
63 * Q15 D30: U16: expanded A
64 * D31:
65 *
66 */
67
68/* float kernel
69 * Q0 D0: Load slot for R
70 * D1: =
71 * Q1 D2: Load slot for G
72 * D3: =
73 * Q2 D4: Load slot for B
74 * D5: =
75 * Q3 D6: Load slot for A
76 * D7: =
77 * Q4 D8: Matrix
78 * D9: =
79 * Q5 D10: =
80 * D11: =
81 * Q6 D12: =
82 * D13: =
83 * Q7 D14: =
84 * D15: =
85 * Q8 D16: Add R
86 * D17: =
87 * Q9 D18: Add G
88 * D19: =
89 * Q10 D20: Add B
90 * D21: =
91 * Q11 D22: Add A
92 * D23: =
93 * Q12 D24: Sum R
94 * D25: =
95 * Q13 D26: Sum G
96 * D27: =
97 * Q14 D28: Sum B
98 * D29: =
99 * Q15 D30: Sum A
100 * D31: =
101 *
102 */
103
104
105
Jason Sams709a0972012-11-15 18:18:04 -0800106using namespace android;
107using namespace android::renderscript;
108
109namespace android {
110namespace renderscript {
111
/*
 * Compact description of one color-matrix configuration.
 *
 * The same storage can be viewed either as a single 64-bit integer (key) or
 * as the individual bit-fields (u).  computeKey() fills the fields in and
 * build() / selectKernel() read them back to pick the code to run.
 *
 * The bit positions quoted below are the ones the original author listed;
 * the exact placement of bit-fields is decided by the compiler/ABI.
 */
typedef union {
    uint64_t key;                   // every field at once
    struct {
        uint32_t inVecSize  :2;     // [0 - 1]  input vector size minus one
        uint32_t outVecSize :2;     // [2 - 3]  output vector size minus one
        uint32_t inType     :4;     // [4 - 7]  non-zero when the input is float
        uint32_t outType    :4;     // [8 - 11] non-zero when the output is float
        uint32_t dot        :1;     // [12]     r,g,b columns identical, nothing added
        uint32_t _unused1   :1;     // [13]
        uint32_t copyAlpha  :1;     // [14]     alpha passes through untouched
        uint32_t _unused2   :1;     // [15]
        uint32_t coeffMask  :16;    // [16-31]  one bit per non-zero matrix coefficient
        uint32_t addMask    :4;     // [32-35]  one bit per non-zero add term
    } u;
} Key_t;
Jason Sams709a0972012-11-15 18:18:04 -0800127
Tim Murray6a45ddb2014-08-06 11:49:02 -0700128//Re-enable when intrinsic is fixed
129#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
/*
 * Table of code entry points handed to the rsdIntrinsicColorMatrix_*_K
 * routines and filled in by the rsdIntrinsicColorMatrixSetup_*_K routines.
 * Those routines are implemented outside this file, so the member order is
 * kept exactly as it was.
 */
typedef struct {
    void (*column[4])(void);    // one entry per matrix column
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;
137
Simon Hosie6e7e2582014-05-06 01:07:21 -0700138extern "C" void rsdIntrinsicColorMatrix_int_K(
Simon Hosie0462a392014-03-07 19:36:44 -0800139 void *out, void const *in, size_t count,
140 FunctionTab_t const *fns,
141 int16_t const *mult, int32_t const *add);
142
Simon Hosie6e7e2582014-05-06 01:07:21 -0700143extern "C" void rsdIntrinsicColorMatrix_float_K(
Simon Hosie0462a392014-03-07 19:36:44 -0800144 void *out, void const *in, size_t count,
145 FunctionTab_t const *fns,
146 float const *mult, float const *add);
147
Simon Hosie6e7e2582014-05-06 01:07:21 -0700148/* The setup functions fill in function tables to be used by above functions;
149 * this code also eliminates jump-to-another-jump cases by short-circuiting
150 * empty functions. While it's not performance critical, it works out easier
151 * to write the set-up code in assembly than to try to expose the same symbols
152 * and write the code in C.
153 */
154extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
155 FunctionTab_t *fns,
156 uint32_t mask, int dt, int st);
157
Simon Hosie0462a392014-03-07 19:36:44 -0800158extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
Simon Hosie6e7e2582014-05-06 01:07:21 -0700159 FunctionTab_t *fns,
Simon Hosie0462a392014-03-07 19:36:44 -0800160 uint32_t mask, int dt, int st);
161#endif
162
Jason Sams709a0972012-11-15 18:18:04 -0800163class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
164public:
165 virtual void populateScript(Script *);
166
167 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
168
169 virtual ~RsdCpuScriptIntrinsicColorMatrix();
Jason Samsc905efd2012-11-26 15:20:18 -0800170 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
Jason Sams709a0972012-11-15 18:18:04 -0800171
Chris Wailesf3712132014-07-16 15:18:30 -0700172 virtual void preLaunch(uint32_t slot, const Allocation ** ains,
173 uint32_t inLen, Allocation * aout, const void * usr,
174 uint32_t usrLen, const RsScriptCall *sc);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700175
Jason Sams709a0972012-11-15 18:18:04 -0800176protected:
177 float fp[16];
Jason Sams2b0d8e62013-08-29 16:41:01 -0700178 float fpa[4];
Jason Samsa65de102013-08-09 13:42:28 -0700179
Jason Sams2b0d8e62013-08-29 16:41:01 -0700180 // The following four fields are read as constants
181 // by the SIMD assembly code.
Jason Sams709a0972012-11-15 18:18:04 -0800182 short ip[16];
Simon Hosie0462a392014-03-07 19:36:44 -0800183 int ipa[4];
Jason Sams2b0d8e62013-08-29 16:41:01 -0700184 float tmpFp[16];
Simon Hosie0462a392014-03-07 19:36:44 -0800185 float tmpFpa[4];
Tim Murray6a45ddb2014-08-06 11:49:02 -0700186#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
Simon Hosie0462a392014-03-07 19:36:44 -0800187 FunctionTab_t mFnTab;
188#endif
Jason Sams709a0972012-11-15 18:18:04 -0800189
Chris Wailes80ef6932014-07-08 11:22:18 -0700190 static void kernel(const RsExpandKernelParams *p,
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700191 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700192 uint32_t outstep);
Jason Sams9e4a96a2013-09-11 15:52:22 -0700193 void updateCoeffCache(float fpMul, float addMul);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700194
Jason Samsa65de102013-08-09 13:42:28 -0700195 Key_t mLastKey;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700196 unsigned char *mBuf;
197 size_t mBufSize;
198
Jason Samsa65de102013-08-09 13:42:28 -0700199 Key_t computeKey(const Element *ein, const Element *eout);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700200
Jason Samsa65de102013-08-09 13:42:28 -0700201 bool build(Key_t key);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700202
203 void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
204
Jason Sams709a0972012-11-15 18:18:04 -0800205};
206
207}
208}
209
210
Jason Samsa65de102013-08-09 13:42:28 -0700211Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700212 const Element *ein, const Element *eout) {
213
Jason Samsa65de102013-08-09 13:42:28 -0700214 Key_t key;
215 key.key = 0;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700216
217 // Compute a unique code key for this operation
218
219 // Add to the key the input and output types
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700220 bool hasFloat = false;
221 if (ein->getType() == RS_TYPE_FLOAT_32) {
222 hasFloat = true;
Jason Samsa65de102013-08-09 13:42:28 -0700223 key.u.inType = RS_TYPE_FLOAT_32;
224 rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700225 }
226 if (eout->getType() == RS_TYPE_FLOAT_32) {
227 hasFloat = true;
Jason Samsa65de102013-08-09 13:42:28 -0700228 key.u.outType = RS_TYPE_FLOAT_32;
229 rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
Jason Sams709a0972012-11-15 18:18:04 -0800230 }
231
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700232 // Mask in the bits indicating which coefficients in the
233 // color matrix are needed.
234 if (hasFloat) {
235 for (uint32_t i=0; i < 16; i++) {
236 if (fabs(fp[i]) != 0.f) {
Jason Samsa65de102013-08-09 13:42:28 -0700237 key.u.coeffMask |= 1 << i;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700238 }
239 }
Jason Samsa65de102013-08-09 13:42:28 -0700240 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700241 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
242 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
243 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
Jason Samsa65de102013-08-09 13:42:28 -0700244
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700245 } else {
246 for (uint32_t i=0; i < 16; i++) {
247 if (ip[i] != 0) {
Jason Samsa65de102013-08-09 13:42:28 -0700248 key.u.coeffMask |= 1 << i;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700249 }
Jason Sams709a0972012-11-15 18:18:04 -0800250 }
Jason Samsa65de102013-08-09 13:42:28 -0700251 if (ipa[0] != 0) key.u.addMask |= 0x1;
Simon Hosie0462a392014-03-07 19:36:44 -0800252 if (ipa[1] != 0) key.u.addMask |= 0x2;
253 if (ipa[2] != 0) key.u.addMask |= 0x4;
254 if (ipa[3] != 0) key.u.addMask |= 0x8;
Jason Sams709a0972012-11-15 18:18:04 -0800255 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700256
257 // Look for a dot product where the r,g,b colums are the same
258 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
259 (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
260 (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
261 (ip[12] == ip[13]) && (ip[12] == ip[14])) {
262
Jason Samsa65de102013-08-09 13:42:28 -0700263 if (!key.u.addMask) key.u.dot = 1;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700264 }
265
266 // Is alpha a simple copy
Jason Samsa65de102013-08-09 13:42:28 -0700267 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700268 key.u.copyAlpha = !(key.u.inType || key.u.outType);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700269 }
270
Jason Samsa65de102013-08-09 13:42:28 -0700271 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
272
273 switch (ein->getVectorSize()) {
274 case 4:
275 key.u.inVecSize = 3;
276 break;
277 case 3:
278 key.u.inVecSize = 2;
279 key.u.coeffMask &= ~0xF000;
280 break;
281 case 2:
282 key.u.inVecSize = 1;
283 key.u.coeffMask &= ~0xFF00;
284 break;
285 default:
286 key.u.coeffMask &= ~0xFFF0;
287 break;
288 }
289
290 switch (eout->getVectorSize()) {
291 case 4:
292 key.u.outVecSize = 3;
293 break;
294 case 3:
295 key.u.outVecSize = 2;
296 key.u.coeffMask &= ~0x8888;
Simon Hosie0462a392014-03-07 19:36:44 -0800297 key.u.addMask &= 7;
Jason Samsa65de102013-08-09 13:42:28 -0700298 break;
299 case 2:
300 key.u.outVecSize = 1;
301 key.u.coeffMask &= ~0xCCCC;
Simon Hosie0462a392014-03-07 19:36:44 -0800302 key.u.addMask &= 3;
Jason Samsa65de102013-08-09 13:42:28 -0700303 break;
304 default:
305 key.u.coeffMask &= ~0xEEEE;
Simon Hosie0462a392014-03-07 19:36:44 -0800306 key.u.addMask &= 1;
Jason Samsa65de102013-08-09 13:42:28 -0700307 break;
308 }
309
Jason Sams9e4a96a2013-09-11 15:52:22 -0700310 if (key.u.inType && !key.u.outType) {
311 key.u.addMask |= 1;
312 if (key.u.outVecSize > 0) key.u.addMask |= 2;
313 if (key.u.outVecSize > 1) key.u.addMask |= 4;
314 if (key.u.outVecSize > 2) key.u.addMask |= 8;
315 }
316
Jason Samsa65de102013-08-09 13:42:28 -0700317 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700318 return key;
Jason Sams709a0972012-11-15 18:18:04 -0800319}
320
Jason Sams074424a2014-05-22 13:30:03 -0700321#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700322
323#define DEF_SYM(x) \
324 extern "C" uint32_t _N_ColorMatrix_##x; \
325 extern "C" uint32_t _N_ColorMatrix_##x##_end; \
326 extern "C" uint32_t _N_ColorMatrix_##x##_len;
327
Jason Samsa65de102013-08-09 13:42:28 -0700328DEF_SYM(prefix_i)
329DEF_SYM(prefix_f)
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700330DEF_SYM(postfix1)
331DEF_SYM(postfix2)
Jason Samsa65de102013-08-09 13:42:28 -0700332
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700333DEF_SYM(load_u8_4)
Jason Sams9e4a96a2013-09-11 15:52:22 -0700334DEF_SYM(load_u8_3)
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700335DEF_SYM(load_u8_2)
336DEF_SYM(load_u8_1)
Jason Samsa65de102013-08-09 13:42:28 -0700337DEF_SYM(load_u8f_4)
Jason Sams9e4a96a2013-09-11 15:52:22 -0700338DEF_SYM(load_u8f_3)
Jason Samsa65de102013-08-09 13:42:28 -0700339DEF_SYM(load_u8f_2)
340DEF_SYM(load_u8f_1)
341DEF_SYM(load_f32_4)
Jason Sams9e4a96a2013-09-11 15:52:22 -0700342DEF_SYM(load_f32_3)
Jason Samsa65de102013-08-09 13:42:28 -0700343DEF_SYM(load_f32_2)
344DEF_SYM(load_f32_1)
345
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700346DEF_SYM(store_u8_4)
347DEF_SYM(store_u8_2)
348DEF_SYM(store_u8_1)
Jason Samsa65de102013-08-09 13:42:28 -0700349DEF_SYM(store_f32_4)
Jason Sams9e4a96a2013-09-11 15:52:22 -0700350DEF_SYM(store_f32_3)
Jason Samsa65de102013-08-09 13:42:28 -0700351DEF_SYM(store_f32_2)
352DEF_SYM(store_f32_1)
Jason Sams2b0d8e62013-08-29 16:41:01 -0700353DEF_SYM(store_f32u_4)
354DEF_SYM(store_f32u_2)
355DEF_SYM(store_f32u_1)
356
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700357DEF_SYM(unpack_u8_4)
358DEF_SYM(unpack_u8_3)
359DEF_SYM(unpack_u8_2)
360DEF_SYM(unpack_u8_1)
361DEF_SYM(pack_u8_4)
362DEF_SYM(pack_u8_3)
363DEF_SYM(pack_u8_2)
364DEF_SYM(pack_u8_1)
365DEF_SYM(dot)
366DEF_SYM(add_0_u8)
367DEF_SYM(add_1_u8)
368DEF_SYM(add_2_u8)
369DEF_SYM(add_3_u8)
370
// Appends the pre-assembled fragment _N_ColorMatrix_<x> to the code being
// generated: copies _N_ColorMatrix_<x>_len bytes to the cursor named buf in
// the calling scope and advances that cursor past them.
// Wrapped in do { } while (0) so an invocation is a single statement and
// stays correct under an unbraced if / else.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
374
375
376static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
377 size_t off = (target - buf - 8) >> 2;
378 rsAssert(((off & 0xff000000) == 0) ||
379 ((off & 0xff000000) == 0xff000000));
380
381 uint32_t op = (condition << 28);
382 op |= 0xa << 24; // branch
383 op |= 0xffffff & off;
384 ((uint32_t *)buf)[0] = op;
385 return buf + 4;
386}
387
Jason Sams2b0d8e62013-08-29 16:41:01 -0700388static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700389 rsAssert(vd < 32);
390 rsAssert(vm < 32);
391 rsAssert(vn < 32);
392
393 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
394 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
395 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700396 return op;
397}
398
399static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
400 //vmlal.s16 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700401 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700402 ((uint32_t *)buf)[0] = op;
403 return buf + 4;
404}
405
406static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
407 //vmull.s16 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700408 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700409 ((uint32_t *)buf)[0] = op;
410 return buf + 4;
411}
Jason Samsa65de102013-08-09 13:42:28 -0700412
413static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
Simon Hosiec7c255e2014-03-07 16:23:12 -0800414 //vqadd.s32 Q#1, Q#1, Q#2
Jason Sams2b0d8e62013-08-29 16:41:01 -0700415 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
Jason Samsa65de102013-08-09 13:42:28 -0700416 ((uint32_t *)buf)[0] = op;
417 return buf + 4;
418}
419
420static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
421 //vmlal.f32 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700422 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
Jason Samsa65de102013-08-09 13:42:28 -0700423 ((uint32_t *)buf)[0] = op;
424 return buf + 4;
425}
426
427static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
428 //vmull.f32 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700429 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
430 ((uint32_t *)buf)[0] = op;
431 return buf + 4;
432}
433
434static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
435 //vadd.f32 Q#1, D#1, D#2
436 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
Jason Samsa65de102013-08-09 13:42:28 -0700437 ((uint32_t *)buf)[0] = op;
438 return buf + 4;
439}
440
Simon Hosiec7c255e2014-03-07 16:23:12 -0800441static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
442 //vmov.32 Q#1, #imm
443 rsAssert(imm == 0);
444 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
445 ((uint32_t *)buf)[0] = op;
446 return buf + 4;
447}
448
Jason Samsa65de102013-08-09 13:42:28 -0700449static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
450 //vadd.f32 Q#1, D#1, D#2
Jason Sams2b0d8e62013-08-29 16:41:01 -0700451 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
Jason Samsa65de102013-08-09 13:42:28 -0700452 ((uint32_t *)buf)[0] = op;
453 return buf + 4;
454}
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700455#endif
456
Rose, James7b7060c2014-04-22 12:08:06 +0800457#if defined(ARCH_X86_HAVE_SSSE3)
458extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
459 const short *coef, uint32_t count);
460extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
461 const short *coef, uint32_t count);
462extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
463 const short *coef, uint32_t count);
464
465void * selectKernel(Key_t key)
466{
467 void * kernel = NULL;
468
469 // inType, outType float if nonzero
470 if (!(key.u.inType || key.u.outType)) {
471 if (key.u.dot)
472 kernel = (void *)rsdIntrinsicColorMatrixDot_K;
473 else if (key.u.copyAlpha)
474 kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
475 else
476 kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
477 }
478
479 return kernel;
480}
481#endif
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700482
Jason Samsa65de102013-08-09 13:42:28 -0700483bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
Jason Sams074424a2014-05-22 13:30:03 -0700484#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700485 mBufSize = 4096;
486 //StopWatch build_time("rs cm: build time");
487 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
488 MAP_PRIVATE | MAP_ANON, -1, 0);
489 if (!mBuf) {
490 return false;
491 }
492
493 uint8_t *buf = mBuf;
Jason Samsa65de102013-08-09 13:42:28 -0700494 uint8_t *buf2 = NULL;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700495
Jason Samsa65de102013-08-09 13:42:28 -0700496 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final
497 int opInit[4] = {0, 0, 0, 0};
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700498
Jason Samsa65de102013-08-09 13:42:28 -0700499 memset(ops, 0, sizeof(ops));
500 for (int i=0; i < 4; i++) {
501 if (key.u.coeffMask & (1 << (i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700502 ops[i][0] = 0x2 | opInit[0];
Jason Samsa65de102013-08-09 13:42:28 -0700503 opInit[0] = 1;
504 }
505 if (!key.u.dot) {
506 if (key.u.coeffMask & (1 << (1 + i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700507 ops[i][1] = 0x2 | opInit[1];
Jason Samsa65de102013-08-09 13:42:28 -0700508 opInit[1] = 1;
509 }
510 if (key.u.coeffMask & (1 << (2 + i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700511 ops[i][2] = 0x2 | opInit[2];
Jason Samsa65de102013-08-09 13:42:28 -0700512 opInit[2] = 1;
513 }
514 }
515 if (!key.u.copyAlpha) {
516 if (key.u.coeffMask & (1 << (3 + i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700517 ops[i][3] = 0x2 | opInit[3];
Jason Samsa65de102013-08-09 13:42:28 -0700518 opInit[3] = 1;
519 }
520 }
521 }
Jason Samsa65de102013-08-09 13:42:28 -0700522
Jason Samsa65de102013-08-09 13:42:28 -0700523 if (key.u.inType || key.u.outType) {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700524 key.u.copyAlpha = 0;
Jason Samsa65de102013-08-09 13:42:28 -0700525 ADD_CHUNK(prefix_f);
526 buf2 = buf;
527
528 // Load the incoming r,g,b,a as needed
529 if (key.u.inType) {
530 switch(key.u.inVecSize) {
531 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700532 ADD_CHUNK(load_f32_4);
533 break;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700534 case 2:
535 ADD_CHUNK(load_f32_3);
536 break;
Jason Samsa65de102013-08-09 13:42:28 -0700537 case 1:
538 ADD_CHUNK(load_f32_2);
539 break;
540 case 0:
541 ADD_CHUNK(load_f32_1);
542 break;
543 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700544 } else {
Jason Samsa65de102013-08-09 13:42:28 -0700545 switch(key.u.inVecSize) {
546 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700547 ADD_CHUNK(load_u8f_4);
548 break;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700549 case 2:
550 ADD_CHUNK(load_u8f_3);
551 break;
Jason Samsa65de102013-08-09 13:42:28 -0700552 case 1:
553 ADD_CHUNK(load_u8f_2);
554 break;
555 case 0:
556 ADD_CHUNK(load_u8f_1);
557 break;
558 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700559 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700560
Jason Samsa65de102013-08-09 13:42:28 -0700561 for (int i=0; i < 4; i++) {
562 for (int j=0; j < 4; j++) {
Jason Samsa65de102013-08-09 13:42:28 -0700563 switch(ops[i][j]) {
564 case 0:
565 break;
Jason Samsa65de102013-08-09 13:42:28 -0700566 case 2:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700567 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
568 break;
569 case 3:
570 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
Jason Samsa65de102013-08-09 13:42:28 -0700571 break;
572 }
573 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700574 }
Jason Samsa65de102013-08-09 13:42:28 -0700575 for (int j=0; j < 4; j++) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700576 if (opInit[j]) {
577 if (key.u.addMask & (1 << j)) {
578 buf = addVADD_F32(buf, j, 12+j, 8+j);
579 } else {
580 buf = addVORR_32(buf, j, 12+j, 12+j);
581 }
582 } else {
583 if (key.u.addMask & (1 << j)) {
Simon Hosiec7c255e2014-03-07 16:23:12 -0800584 buf = addVORR_32(buf, j, 8+j, 8+j);
585 } else {
586 buf = addVMOV_32(buf, j, 0);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700587 }
588 }
589 }
590
591 if (key.u.outType) {
592 switch(key.u.outVecSize) {
593 case 3:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700594 ADD_CHUNK(store_f32_4);
595 break;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700596 case 2:
597 ADD_CHUNK(store_f32_3);
598 break;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700599 case 1:
600 ADD_CHUNK(store_f32_2);
601 break;
602 case 0:
603 ADD_CHUNK(store_f32_1);
604 break;
605 }
606 } else {
607 switch(key.u.outVecSize) {
608 case 3:
609 case 2:
610 ADD_CHUNK(store_f32u_4);
611 break;
612 case 1:
613 ADD_CHUNK(store_f32u_2);
614 break;
615 case 0:
616 ADD_CHUNK(store_f32u_1);
617 break;
Jason Samsa65de102013-08-09 13:42:28 -0700618 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700619 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700620
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700621
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700622 } else {
Jason Samsa65de102013-08-09 13:42:28 -0700623 // Add the function prefix
624 // Store the address for the loop return
625 ADD_CHUNK(prefix_i);
626 buf2 = buf;
627
628 // Load the incoming r,g,b,a as needed
629 switch(key.u.inVecSize) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700630 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700631 ADD_CHUNK(load_u8_4);
632 if (key.u.copyAlpha) {
633 ADD_CHUNK(unpack_u8_3);
634 } else {
635 ADD_CHUNK(unpack_u8_4);
636 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700637 break;
638 case 2:
Jason Sams9e4a96a2013-09-11 15:52:22 -0700639 ADD_CHUNK(load_u8_3);
Jason Samsa65de102013-08-09 13:42:28 -0700640 ADD_CHUNK(unpack_u8_3);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700641 break;
642 case 1:
Jason Samsa65de102013-08-09 13:42:28 -0700643 ADD_CHUNK(load_u8_2);
644 ADD_CHUNK(unpack_u8_2);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700645 break;
646 case 0:
Jason Samsa65de102013-08-09 13:42:28 -0700647 ADD_CHUNK(load_u8_1);
648 ADD_CHUNK(unpack_u8_1);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700649 break;
650 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700651
Jason Samsa65de102013-08-09 13:42:28 -0700652 // Add multiply and accumulate
653 // use MULL to init the output register,
654 // use MLAL from there
655 for (int i=0; i < 4; i++) {
656 for (int j=0; j < 4; j++) {
Jason Samsa65de102013-08-09 13:42:28 -0700657 switch(ops[i][j]) {
658 case 0:
659 break;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700660 case 2:
Jason Samsa65de102013-08-09 13:42:28 -0700661 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
662 break;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700663 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700664 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
665 break;
666 }
667 }
668 }
669 for (int j=0; j < 4; j++) {
Jason Samsec3cd2d2013-09-11 18:08:47 -0700670 if (opInit[j]) {
671 if (key.u.addMask & (1 << j)) {
672 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
673 }
674 } else {
675 if (key.u.addMask & (1 << j)) {
Simon Hosiec7c255e2014-03-07 16:23:12 -0800676 buf = addVORR_32(buf, 8+j, 4+j, 4+j);
Jason Samsec3cd2d2013-09-11 18:08:47 -0700677 }
Jason Samsa65de102013-08-09 13:42:28 -0700678 }
679 }
680
681 // If we have a dot product, perform the special pack.
682 if (key.u.dot) {
683 ADD_CHUNK(pack_u8_1);
684 ADD_CHUNK(dot);
685 } else {
686 switch(key.u.outVecSize) {
687 case 3:
Jason Sams17e3cdc2013-09-09 17:32:16 -0700688 if (key.u.copyAlpha) {
689 ADD_CHUNK(pack_u8_3);
690 } else {
691 ADD_CHUNK(pack_u8_4);
692 }
Jason Samsa65de102013-08-09 13:42:28 -0700693 break;
694 case 2:
695 ADD_CHUNK(pack_u8_3);
696 break;
697 case 1:
698 ADD_CHUNK(pack_u8_2);
699 break;
700 case 0:
701 ADD_CHUNK(pack_u8_1);
702 break;
703 }
704 }
705
706 // Write out result
707 switch(key.u.outVecSize) {
708 case 3:
709 case 2:
710 ADD_CHUNK(store_u8_4);
711 break;
712 case 1:
713 ADD_CHUNK(store_u8_2);
714 break;
715 case 0:
716 ADD_CHUNK(store_u8_1);
717 break;
718 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700719 }
720
Jason Sams2b0d8e62013-08-29 16:41:01 -0700721 if (key.u.inType != key.u.outType) {
722 key.u.copyAlpha = 0;
723 key.u.dot = 0;
724 }
725
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700726 // Loop, branch, and cleanup
727 ADD_CHUNK(postfix1);
728 buf = addBranch(buf, buf2, 0x01);
729 ADD_CHUNK(postfix2);
730
731 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
732 if (ret == -1) {
733 ALOGE("mprotect error %i", ret);
734 return false;
735 }
736
Narayan Kamath72f5f8c2014-03-11 12:23:29 +0000737 FLUSH_CPU_CACHE(mBuf, (char*) mBuf + mBufSize);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700738 return true;
739#else
740 return false;
741#endif
742}
743
Jason Samsec3cd2d2013-09-11 18:08:47 -0700744void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700745 for(int ct=0; ct < 16; ct++) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700746 ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
747 tmpFp[ct] = fp[ct] * fpMul;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700748 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700749 }
750
Jason Samsec3cd2d2013-09-11 18:08:47 -0700751 float add = 0.f;
752 if (fpMul > 254.f) add = 0.5f;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700753 for(int ct=0; ct < 4; ct++) {
Simon Hosie0462a392014-03-07 19:36:44 -0800754 tmpFpa[ct] = fpa[ct] * addMul + add;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700755 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700756 }
757
Jason Samsec3cd2d2013-09-11 18:08:47 -0700758 for(int ct=0; ct < 4; ct++) {
Simon Hosie0462a392014-03-07 19:36:44 -0800759 ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700760 }
Jason Sams2b0d8e62013-08-29 16:41:01 -0700761}
762
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700763void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
764 size_t dataLength) {
765 switch(slot) {
766 case 0:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700767 memcpy (fp, data, sizeof(fp));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700768 break;
769 case 1:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700770 memcpy (fpa, data, sizeof(fpa));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700771 break;
772 default:
773 rsAssert(0);
774 break;
775 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700776 mRootPtr = &kernel;
777}
778
Jason Sams709a0972012-11-15 18:18:04 -0800779
Chris Wailes80ef6932014-07-08 11:22:18 -0700780static void One(const RsExpandKernelParams *p, void *out,
Jason Sams17e3cdc2013-09-09 17:32:16 -0700781 const void *py, const float* coeff, const float *add,
Jason Samsa65de102013-08-09 13:42:28 -0700782 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
783
784 float4 f = 0.f;
785 if (fin) {
786 switch(vsin) {
787 case 3:
Jason Sams9e4a96a2013-09-11 15:52:22 -0700788 f = ((const float4 *)py)[0];
789 break;
Jason Samsa65de102013-08-09 13:42:28 -0700790 case 2:
791 f = ((const float4 *)py)[0];
Jason Sams9e4a96a2013-09-11 15:52:22 -0700792 f.w = 0.f;
Jason Samsa65de102013-08-09 13:42:28 -0700793 break;
794 case 1:
795 f.xy = ((const float2 *)py)[0];
796 break;
797 case 0:
798 f.x = ((const float *)py)[0];
799 break;
800 }
801 } else {
802 switch(vsin) {
803 case 3:
Jason Sams9e4a96a2013-09-11 15:52:22 -0700804 f = convert_float4(((const uchar4 *)py)[0]);
805 break;
Jason Samsa65de102013-08-09 13:42:28 -0700806 case 2:
807 f = convert_float4(((const uchar4 *)py)[0]);
Jason Sams9e4a96a2013-09-11 15:52:22 -0700808 f.w = 0.f;
Jason Samsa65de102013-08-09 13:42:28 -0700809 break;
810 case 1:
Jason Sams68c81722013-08-21 16:58:27 -0700811 f.xy = convert_float2(((const uchar2 *)py)[0]);
Jason Samsa65de102013-08-09 13:42:28 -0700812 break;
813 case 0:
Jason Sams68c81722013-08-21 16:58:27 -0700814 f.x = (float)(((const uchar *)py)[0]);
Jason Samsa65de102013-08-09 13:42:28 -0700815 break;
816 }
817 }
Jason Sams2b0d8e62013-08-29 16:41:01 -0700818 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
Jason Sams709a0972012-11-15 18:18:04 -0800819
820 float4 sum;
Jason Samsa65de102013-08-09 13:42:28 -0700821 sum.x = f.x * coeff[0] +
822 f.y * coeff[4] +
823 f.z * coeff[8] +
824 f.w * coeff[12];
825 sum.y = f.x * coeff[1] +
826 f.y * coeff[5] +
827 f.z * coeff[9] +
828 f.w * coeff[13];
829 sum.z = f.x * coeff[2] +
830 f.y * coeff[6] +
831 f.z * coeff[10] +
832 f.w * coeff[14];
833 sum.w = f.x * coeff[3] +
834 f.y * coeff[7] +
835 f.z * coeff[11] +
836 f.w * coeff[15];
Jason Sams2b0d8e62013-08-29 16:41:01 -0700837 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
Jason Sams709a0972012-11-15 18:18:04 -0800838
Jason Sams17e3cdc2013-09-09 17:32:16 -0700839 sum.x += add[0];
Simon Hosie0462a392014-03-07 19:36:44 -0800840 sum.y += add[1];
841 sum.z += add[2];
842 sum.w += add[3];
Jason Sams17e3cdc2013-09-09 17:32:16 -0700843
Jason Sams709a0972012-11-15 18:18:04 -0800844
Jason Sams2b0d8e62013-08-29 16:41:01 -0700845 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
Jason Samsa65de102013-08-09 13:42:28 -0700846 if (fout) {
847 switch(vsout) {
848 case 3:
849 case 2:
850 ((float4 *)out)[0] = sum;
851 break;
852 case 1:
853 ((float2 *)out)[0] = sum.xy;
854 break;
855 case 0:
856 ((float *)out)[0] = sum.x;
857 break;
858 }
859 } else {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700860 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
861 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
862 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
863 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
Jason Sams17e3cdc2013-09-09 17:32:16 -0700864
Jason Samsa65de102013-08-09 13:42:28 -0700865 switch(vsout) {
866 case 3:
867 case 2:
868 ((uchar4 *)out)[0] = convert_uchar4(sum);
869 break;
870 case 1:
871 ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
872 break;
873 case 0:
874 ((uchar *)out)[0] = sum.x;
875 break;
876 }
877 }
Jason Sams2b0d8e62013-08-29 16:41:01 -0700878 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
Jason Sams709a0972012-11-15 18:18:04 -0800879}
880
Chris Wailes80ef6932014-07-08 11:22:18 -0700881void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700882 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700883 uint32_t outstep) {
Jason Sams709a0972012-11-15 18:18:04 -0800884 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
Chris Wailesf3712132014-07-16 15:18:30 -0700885
Chris Wailes9ed79102014-07-25 15:53:28 -0700886 uint32_t instep = p->inEStrides[0];
Chris Wailesf3712132014-07-16 15:18:30 -0700887
888 uchar *out = (uchar *)p->out + outstep * xstart;
889 uchar *in = (uchar *)p->ins[0] + instep * xstart;
890
Jason Sams709a0972012-11-15 18:18:04 -0800891 uint32_t x1 = xstart;
892 uint32_t x2 = xend;
893
Jason Samsa65de102013-08-09 13:42:28 -0700894 uint32_t vsin = cp->mLastKey.u.inVecSize;
895 uint32_t vsout = cp->mLastKey.u.outVecSize;
896 bool floatIn = !!cp->mLastKey.u.inType;
897 bool floatOut = !!cp->mLastKey.u.outType;
898
Jason Sams9e4a96a2013-09-11 15:52:22 -0700899 //if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
900
Jason Sams709a0972012-11-15 18:18:04 -0800901 if(x2 > x1) {
Simon Hosie0462a392014-03-07 19:36:44 -0800902 int32_t len = x2 - x1;
903 if (gArchUseSIMD) {
904 if((cp->mOptKernel != NULL) && (len >= 4)) {
Jason Sams858d0352014-04-29 18:10:50 -0700905 // The optimized kernel processes 4 pixels at once
906 // and requires a minimum of 1 chunk of 4
Simon Hosie0462a392014-03-07 19:36:44 -0800907 cp->mOptKernel(out, in, cp->ip, len >> 2);
Jason Sams858d0352014-04-29 18:10:50 -0700908 // Update the len and pointers so the generic code can
909 // finish any leftover pixels
Jason Sams98dd4bb2014-04-29 15:30:30 -0700910 len &= ~3;
Simon Hosie0462a392014-03-07 19:36:44 -0800911 x1 += len;
912 out += outstep * len;
913 in += instep * len;
914 }
Tim Murray6a45ddb2014-08-06 11:49:02 -0700915#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
Simon Hosie0462a392014-03-07 19:36:44 -0800916 else {
Simon Hosie0462a392014-03-07 19:36:44 -0800917 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
Simon Hosie6e7e2582014-05-06 01:07:21 -0700918 rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
Simon Hosie0462a392014-03-07 19:36:44 -0800919 } else {
Simon Hosie6e7e2582014-05-06 01:07:21 -0700920 rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
Simon Hosie0462a392014-03-07 19:36:44 -0800921 }
Simon Hosie6e7e2582014-05-06 01:07:21 -0700922 x1 += len;
923 out += outstep * len;
924 in += instep * len;
Simon Hosie0462a392014-03-07 19:36:44 -0800925 }
926#endif
Jason Sams709a0972012-11-15 18:18:04 -0800927 }
Jason Sams709a0972012-11-15 18:18:04 -0800928
929 while(x1 != x2) {
Jason Samsec3cd2d2013-09-11 18:08:47 -0700930 One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
Jason Sams68c81722013-08-21 16:58:27 -0700931 out += outstep;
932 in += instep;
Jason Sams709a0972012-11-15 18:18:04 -0800933 x1++;
934 }
935 }
936}
937
Chris Wailesf3712132014-07-16 15:18:30 -0700938void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
939 const Allocation ** ains,
940 uint32_t inLen,
941 Allocation * aout,
942 const void * usr,
943 uint32_t usrLen,
944 const RsScriptCall *sc) {
Jason Sams709a0972012-11-15 18:18:04 -0800945
Chris Wailesf3712132014-07-16 15:18:30 -0700946 const Element *ein = ains[0]->mHal.state.type->getElement();
Jason Sams17e3cdc2013-09-09 17:32:16 -0700947 const Element *eout = aout->mHal.state.type->getElement();
948
949 if (ein->getType() == eout->getType()) {
Jason Samsec3cd2d2013-09-11 18:08:47 -0700950 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
951 updateCoeffCache(1.f, 255.f);
952 } else {
953 updateCoeffCache(1.f, 1.f);
954 }
Jason Sams17e3cdc2013-09-09 17:32:16 -0700955 } else {
956 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700957 updateCoeffCache(255.f, 255.f);
Jason Sams17e3cdc2013-09-09 17:32:16 -0700958 } else {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700959 updateCoeffCache(1.f / 255.f, 1.f);
Jason Sams17e3cdc2013-09-09 17:32:16 -0700960 }
961 }
962
Chris Wailesf3712132014-07-16 15:18:30 -0700963 Key_t key = computeKey(ein, eout);
964
Rose, James7b7060c2014-04-22 12:08:06 +0800965#if defined(ARCH_X86_HAVE_SSSE3)
966 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
967 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
968 // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
969 mLastKey = key;
970 }
971
972#else //if !defined(ARCH_X86_HAVE_SSSE3)
Jason Samsa65de102013-08-09 13:42:28 -0700973 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700974 if (mBuf) munmap(mBuf, mBufSize);
975 mBuf = NULL;
976 mOptKernel = NULL;
977 if (build(key)) {
978 mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
Jason Sams709a0972012-11-15 18:18:04 -0800979 }
Tim Murray6a45ddb2014-08-06 11:49:02 -0700980#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
Simon Hosie0462a392014-03-07 19:36:44 -0800981 else {
982 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
983 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
984 uint32_t mm = 0;
985 int i;
986 for (i = 0; i < 4; i++)
987 {
988 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
989 m = ((m * 0x249) >> 9) & 15;
990 m |= ((key.u.addMask >> i) & 1) << 4;
991 mm |= m << (i * 5);
992 }
993
994 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
995 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
996 } else {
997 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
998 }
999 }
1000#endif
1001 mLastKey = key;
Jason Sams709a0972012-11-15 18:18:04 -08001002 }
Rose, James7b7060c2014-04-22 12:08:06 +08001003#endif //if !defined(ARCH_X86_HAVE_SSSE3)
Jason Sams709a0972012-11-15 18:18:04 -08001004}
1005
Jason Sams709a0972012-11-15 18:18:04 -08001006RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
Jason Samsc905efd2012-11-26 15:20:18 -08001007 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1008 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
Jason Sams709a0972012-11-15 18:18:04 -08001009
Jason Samsa65de102013-08-09 13:42:28 -07001010 mLastKey.key = 0;
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001011 mBuf = NULL;
1012 mBufSize = 0;
1013 mOptKernel = NULL;
Jason Sams709a0972012-11-15 18:18:04 -08001014 const static float defaultMatrix[] = {
1015 1.f, 0.f, 0.f, 0.f,
1016 0.f, 1.f, 0.f, 0.f,
1017 0.f, 0.f, 1.f, 0.f,
1018 0.f, 0.f, 0.f, 1.f
1019 };
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001020 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
Jason Sams709a0972012-11-15 18:18:04 -08001021 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001022 setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
Jason Sams709a0972012-11-15 18:18:04 -08001023}
1024
1025RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001026 if (mBuf) munmap(mBuf, mBufSize);
1027 mBuf = NULL;
1028 mOptKernel = NULL;
Jason Sams709a0972012-11-15 18:18:04 -08001029}
1030
1031void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001032 s->mHal.info.exportedVariableCount = 2;
Jason Sams709a0972012-11-15 18:18:04 -08001033}
1034
Jason Samsc905efd2012-11-26 15:20:18 -08001035RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1036 const Script *s, const Element *e) {
Jason Sams709a0972012-11-15 18:18:04 -08001037
Jason Samsc905efd2012-11-26 15:20:18 -08001038 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
Jason Sams709a0972012-11-15 18:18:04 -08001039}