blob: 5638f8492df380812e825e5a2e6d3bc73fc37184 [file] [log] [blame]
Jason Sams709a0972012-11-15 18:18:04 -08001/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <errno.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"

//#include <utils/StopWatch.h>
28
Jason Sams709a0972012-11-15 18:18:04 -080029
Jason Samsa65de102013-08-09 13:42:28 -070030/* uint kernel
31 * Q0 D0: Load slot for R
32 * D1: Load slot for G
33 * Q1 D2: Load slot for B
34 * D3: Load slot for A
35 * Q2 D4: Matrix
36 * D5: =
37 * Q3 D6: =
38 * D7: =
39 * Q4 D8: Add R
40 * D9:
41 * Q5 D10: Add G
42 * D11:
43 * Q6 D12: Add B
44 * D13:
45 * Q7 D14: Add A
46 * D15:
47 * Q8 D16: I32: R Sum
48 * D17:
49 * Q9 D18: I32: G Sum
50 * D19:
51 * Q10 D20: I32: B Sum
52 * D21:
53 * Q11 D22: I32: A Sum
54 * D23:
55 * Q12 D24: U16: expanded R
56 * D25:
57 * Q13 D26: U16: expanded G
58 * D27:
59 * Q14 D28: U16: expanded B
60 * D29:
61 * Q15 D30: U16: expanded A
62 * D31:
63 *
64 */
65
66/* float kernel
67 * Q0 D0: Load slot for R
68 * D1: =
69 * Q1 D2: Load slot for G
70 * D3: =
71 * Q2 D4: Load slot for B
72 * D5: =
73 * Q3 D6: Load slot for A
74 * D7: =
75 * Q4 D8: Matrix
76 * D9: =
77 * Q5 D10: =
78 * D11: =
79 * Q6 D12: =
80 * D13: =
81 * Q7 D14: =
82 * D15: =
83 * Q8 D16: Add R
84 * D17: =
85 * Q9 D18: Add G
86 * D19: =
87 * Q10 D20: Add B
88 * D21: =
89 * Q11 D22: Add A
90 * D23: =
91 * Q12 D24: Sum R
92 * D25: =
93 * Q13 D26: Sum G
94 * D27: =
95 * Q14 D28: Sum B
96 * D29: =
97 * Q15 D30: Sum A
98 * D31: =
99 *
100 */
101
102
103
Jason Sams709a0972012-11-15 18:18:04 -0800104using namespace android;
105using namespace android::renderscript;
106
107namespace android {
108namespace renderscript {
109
Jason Samsa65de102013-08-09 13:42:28 -0700110typedef union {
111 uint64_t key;
112 struct {
113 uint32_t inVecSize :2; // [0 - 1]
114 uint32_t outVecSize :2; // [2 - 3]
115 uint32_t inType :4; // [4 - 7]
116 uint32_t outType :4; // [8 - 11]
117 uint32_t dot :1; // [12]
118 uint32_t _unused1 :1; // [13]
119 uint32_t copyAlpha :1; // [14]
120 uint32_t _unused2 :1; // [15]
121 uint32_t coeffMask :16; // [16-31]
122 uint32_t addMask :4; // [32-35]
123 } u;
124} Key_t;
Jason Sams709a0972012-11-15 18:18:04 -0800125
Tim Murray6a45ddb2014-08-06 11:49:02 -0700126//Re-enable when intrinsic is fixed
Jason Sams32f9d042014-10-22 17:25:51 -0700127#if defined(ARCH_ARM64_USE_INTRINSICS)
Simon Hosie0462a392014-03-07 19:36:44 -0800128typedef struct {
129 void (*column[4])(void);
130 void (*store)(void);
131 void (*load)(void);
Simon Hosie6e7e2582014-05-06 01:07:21 -0700132 void (*store_end)(void);
133 void (*load_end)(void);
Simon Hosie0462a392014-03-07 19:36:44 -0800134} FunctionTab_t;
135
Simon Hosie6e7e2582014-05-06 01:07:21 -0700136extern "C" void rsdIntrinsicColorMatrix_int_K(
Simon Hosie0462a392014-03-07 19:36:44 -0800137 void *out, void const *in, size_t count,
138 FunctionTab_t const *fns,
139 int16_t const *mult, int32_t const *add);
140
Simon Hosie6e7e2582014-05-06 01:07:21 -0700141extern "C" void rsdIntrinsicColorMatrix_float_K(
Simon Hosie0462a392014-03-07 19:36:44 -0800142 void *out, void const *in, size_t count,
143 FunctionTab_t const *fns,
144 float const *mult, float const *add);
145
Simon Hosie6e7e2582014-05-06 01:07:21 -0700146/* The setup functions fill in function tables to be used by above functions;
147 * this code also eliminates jump-to-another-jump cases by short-circuiting
148 * empty functions. While it's not performance critical, it works out easier
149 * to write the set-up code in assembly than to try to expose the same symbols
150 * and write the code in C.
151 */
152extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
153 FunctionTab_t *fns,
154 uint32_t mask, int dt, int st);
155
Simon Hosie0462a392014-03-07 19:36:44 -0800156extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
Simon Hosie6e7e2582014-05-06 01:07:21 -0700157 FunctionTab_t *fns,
Simon Hosie0462a392014-03-07 19:36:44 -0800158 uint32_t mask, int dt, int st);
159#endif
160
Jason Sams709a0972012-11-15 18:18:04 -0800161class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
162public:
Stephen Hinesc060f142015-05-13 19:26:09 -0700163 void populateScript(Script *) override;
Jason Sams709a0972012-11-15 18:18:04 -0800164
Stephen Hinesc060f142015-05-13 19:26:09 -0700165 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
Jason Sams709a0972012-11-15 18:18:04 -0800166
Stephen Hinesc060f142015-05-13 19:26:09 -0700167 ~RsdCpuScriptIntrinsicColorMatrix() override;
Jason Samsc905efd2012-11-26 15:20:18 -0800168 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
Jason Sams709a0972012-11-15 18:18:04 -0800169
Stephen Hinesc060f142015-05-13 19:26:09 -0700170 void preLaunch(uint32_t slot, const Allocation ** ains,
171 uint32_t inLen, Allocation * aout, const void * usr,
172 uint32_t usrLen, const RsScriptCall *sc) override;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700173
Jason Sams709a0972012-11-15 18:18:04 -0800174protected:
175 float fp[16];
Jason Sams2b0d8e62013-08-29 16:41:01 -0700176 float fpa[4];
Jason Samsa65de102013-08-09 13:42:28 -0700177
Jason Sams2b0d8e62013-08-29 16:41:01 -0700178 // The following four fields are read as constants
179 // by the SIMD assembly code.
Jason Sams709a0972012-11-15 18:18:04 -0800180 short ip[16];
Simon Hosie0462a392014-03-07 19:36:44 -0800181 int ipa[4];
Jason Sams2b0d8e62013-08-29 16:41:01 -0700182 float tmpFp[16];
Simon Hosie0462a392014-03-07 19:36:44 -0800183 float tmpFpa[4];
Jason Sams32f9d042014-10-22 17:25:51 -0700184#if defined(ARCH_ARM64_USE_INTRINSICS)
Simon Hosie0462a392014-03-07 19:36:44 -0800185 FunctionTab_t mFnTab;
186#endif
Jason Sams709a0972012-11-15 18:18:04 -0800187
David Grossb0abb142015-03-12 15:23:03 -0700188 static void kernel(const RsExpandKernelDriverInfo *info,
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700189 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700190 uint32_t outstep);
Jason Sams9e4a96a2013-09-11 15:52:22 -0700191 void updateCoeffCache(float fpMul, float addMul);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700192
Jason Samsa65de102013-08-09 13:42:28 -0700193 Key_t mLastKey;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700194 unsigned char *mBuf;
195 size_t mBufSize;
196
Jason Samsa65de102013-08-09 13:42:28 -0700197 Key_t computeKey(const Element *ein, const Element *eout);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700198
Jason Samsa65de102013-08-09 13:42:28 -0700199 bool build(Key_t key);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700200
201 void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
202
Jason Sams709a0972012-11-15 18:18:04 -0800203};
204
205}
206}
207
208
Jason Samsa65de102013-08-09 13:42:28 -0700209Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700210 const Element *ein, const Element *eout) {
211
Jason Samsa65de102013-08-09 13:42:28 -0700212 Key_t key;
213 key.key = 0;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700214
215 // Compute a unique code key for this operation
216
217 // Add to the key the input and output types
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700218 bool hasFloat = false;
219 if (ein->getType() == RS_TYPE_FLOAT_32) {
220 hasFloat = true;
Jason Samsa65de102013-08-09 13:42:28 -0700221 key.u.inType = RS_TYPE_FLOAT_32;
222 rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700223 }
224 if (eout->getType() == RS_TYPE_FLOAT_32) {
225 hasFloat = true;
Jason Samsa65de102013-08-09 13:42:28 -0700226 key.u.outType = RS_TYPE_FLOAT_32;
227 rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
Jason Sams709a0972012-11-15 18:18:04 -0800228 }
229
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700230 // Mask in the bits indicating which coefficients in the
231 // color matrix are needed.
232 if (hasFloat) {
233 for (uint32_t i=0; i < 16; i++) {
234 if (fabs(fp[i]) != 0.f) {
Jason Samsa65de102013-08-09 13:42:28 -0700235 key.u.coeffMask |= 1 << i;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700236 }
237 }
Jason Samsa65de102013-08-09 13:42:28 -0700238 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700239 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
240 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
241 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
Jason Samsa65de102013-08-09 13:42:28 -0700242
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700243 } else {
244 for (uint32_t i=0; i < 16; i++) {
245 if (ip[i] != 0) {
Jason Samsa65de102013-08-09 13:42:28 -0700246 key.u.coeffMask |= 1 << i;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700247 }
Jason Sams709a0972012-11-15 18:18:04 -0800248 }
Jason Samsa65de102013-08-09 13:42:28 -0700249 if (ipa[0] != 0) key.u.addMask |= 0x1;
Simon Hosie0462a392014-03-07 19:36:44 -0800250 if (ipa[1] != 0) key.u.addMask |= 0x2;
251 if (ipa[2] != 0) key.u.addMask |= 0x4;
252 if (ipa[3] != 0) key.u.addMask |= 0x8;
Jason Sams709a0972012-11-15 18:18:04 -0800253 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700254
255 // Look for a dot product where the r,g,b colums are the same
256 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
257 (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
258 (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
259 (ip[12] == ip[13]) && (ip[12] == ip[14])) {
260
Jason Samsa65de102013-08-09 13:42:28 -0700261 if (!key.u.addMask) key.u.dot = 1;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700262 }
263
264 // Is alpha a simple copy
Jason Samsa65de102013-08-09 13:42:28 -0700265 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700266 key.u.copyAlpha = !(key.u.inType || key.u.outType);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700267 }
268
Jason Samsa65de102013-08-09 13:42:28 -0700269 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
270
271 switch (ein->getVectorSize()) {
272 case 4:
273 key.u.inVecSize = 3;
274 break;
275 case 3:
276 key.u.inVecSize = 2;
277 key.u.coeffMask &= ~0xF000;
278 break;
279 case 2:
280 key.u.inVecSize = 1;
281 key.u.coeffMask &= ~0xFF00;
282 break;
283 default:
284 key.u.coeffMask &= ~0xFFF0;
285 break;
286 }
287
288 switch (eout->getVectorSize()) {
289 case 4:
290 key.u.outVecSize = 3;
291 break;
292 case 3:
293 key.u.outVecSize = 2;
294 key.u.coeffMask &= ~0x8888;
Simon Hosie0462a392014-03-07 19:36:44 -0800295 key.u.addMask &= 7;
Jason Samsa65de102013-08-09 13:42:28 -0700296 break;
297 case 2:
298 key.u.outVecSize = 1;
299 key.u.coeffMask &= ~0xCCCC;
Simon Hosie0462a392014-03-07 19:36:44 -0800300 key.u.addMask &= 3;
Jason Samsa65de102013-08-09 13:42:28 -0700301 break;
302 default:
303 key.u.coeffMask &= ~0xEEEE;
Simon Hosie0462a392014-03-07 19:36:44 -0800304 key.u.addMask &= 1;
Jason Samsa65de102013-08-09 13:42:28 -0700305 break;
306 }
307
Jason Sams9e4a96a2013-09-11 15:52:22 -0700308 if (key.u.inType && !key.u.outType) {
309 key.u.addMask |= 1;
310 if (key.u.outVecSize > 0) key.u.addMask |= 2;
311 if (key.u.outVecSize > 1) key.u.addMask |= 4;
312 if (key.u.outVecSize > 2) key.u.addMask |= 8;
313 }
314
Jason Samsa65de102013-08-09 13:42:28 -0700315 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700316 return key;
Jason Sams709a0972012-11-15 18:18:04 -0800317}
318
Jason Sams074424a2014-05-22 13:30:03 -0700319#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700320
// Declares the three symbols that describe one pre-assembled code fragment
// defined outside this file: the fragment itself, a companion `_end` symbol
// (presumably marking where it stops), and `_len`, whose value is the
// fragment's length in bytes as consumed by ADD_CHUNK.
#define DEF_SYM(x)                                      \
    extern "C" uint32_t _N_ColorMatrix_##x;             \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;       \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologues (integer / float) and the two epilogue pieces.
DEF_SYM(prefix_i)    DEF_SYM(prefix_f)
DEF_SYM(postfix1)    DEF_SYM(postfix2)

// Loads: u8 for the integer path, u8f for u8 input on the float path,
// f32 for float input.
DEF_SYM(load_u8_4)   DEF_SYM(load_u8_3)   DEF_SYM(load_u8_2)   DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)  DEF_SYM(load_u8f_3)  DEF_SYM(load_u8f_2)  DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)  DEF_SYM(load_f32_3)  DEF_SYM(load_f32_2)  DEF_SYM(load_f32_1)

// Stores.  There is no 3-wide u8 or f32u store; build() uses the 4-wide
// fragment for 3-component output.
DEF_SYM(store_u8_4)    DEF_SYM(store_u8_2)    DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)   DEF_SYM(store_f32_3)   DEF_SYM(store_f32_2)   DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)  DEF_SYM(store_f32u_2)  DEF_SYM(store_f32u_1)

// Integer-path widening and narrowing.
DEF_SYM(unpack_u8_4)  DEF_SYM(unpack_u8_3)  DEF_SYM(unpack_u8_2)  DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)    DEF_SYM(pack_u8_3)    DEF_SYM(pack_u8_2)    DEF_SYM(pack_u8_1)

// Dot-product special case and integer add fragments.
DEF_SYM(dot)
DEF_SYM(add_0_u8)  DEF_SYM(add_1_u8)  DEF_SYM(add_2_u8)  DEF_SYM(add_3_u8)
368
// Copies pre-assembled fragment `x` to the write cursor and advances the
// cursor past it.  Expects a local `uint8_t *buf` in the calling scope.
// Wrapped in do { } while (0) so the two steps behave as a single statement
// (safe under an unbraced if/else); invoke it with a trailing semicolon.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
372
373
374static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
375 size_t off = (target - buf - 8) >> 2;
376 rsAssert(((off & 0xff000000) == 0) ||
377 ((off & 0xff000000) == 0xff000000));
378
379 uint32_t op = (condition << 28);
380 op |= 0xa << 24; // branch
381 op |= 0xffffff & off;
382 ((uint32_t *)buf)[0] = op;
383 return buf + 4;
384}
385
Jason Sams2b0d8e62013-08-29 16:41:01 -0700386static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700387 rsAssert(vd < 32);
388 rsAssert(vm < 32);
389 rsAssert(vn < 32);
390
391 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
392 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
393 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700394 return op;
395}
396
397static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
398 //vmlal.s16 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700399 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700400 ((uint32_t *)buf)[0] = op;
401 return buf + 4;
402}
403
404static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
405 //vmull.s16 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700406 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700407 ((uint32_t *)buf)[0] = op;
408 return buf + 4;
409}
Jason Samsa65de102013-08-09 13:42:28 -0700410
411static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
Simon Hosiec7c255e2014-03-07 16:23:12 -0800412 //vqadd.s32 Q#1, Q#1, Q#2
Jason Sams2b0d8e62013-08-29 16:41:01 -0700413 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
Jason Samsa65de102013-08-09 13:42:28 -0700414 ((uint32_t *)buf)[0] = op;
415 return buf + 4;
416}
417
418static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
419 //vmlal.f32 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700420 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
Jason Samsa65de102013-08-09 13:42:28 -0700421 ((uint32_t *)buf)[0] = op;
422 return buf + 4;
423}
424
425static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
426 //vmull.f32 Q#1, D#1, D#2[#]
Jason Sams2b0d8e62013-08-29 16:41:01 -0700427 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
428 ((uint32_t *)buf)[0] = op;
429 return buf + 4;
430}
431
432static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
433 //vadd.f32 Q#1, D#1, D#2
434 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
Jason Samsa65de102013-08-09 13:42:28 -0700435 ((uint32_t *)buf)[0] = op;
436 return buf + 4;
437}
438
Simon Hosiec7c255e2014-03-07 16:23:12 -0800439static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
440 //vmov.32 Q#1, #imm
441 rsAssert(imm == 0);
442 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
443 ((uint32_t *)buf)[0] = op;
444 return buf + 4;
445}
446
Jason Samsa65de102013-08-09 13:42:28 -0700447static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
448 //vadd.f32 Q#1, D#1, D#2
Jason Sams2b0d8e62013-08-29 16:41:01 -0700449 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
Jason Samsa65de102013-08-09 13:42:28 -0700450 ((uint32_t *)buf)[0] = op;
451 return buf + 4;
452}
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700453#endif
454
Rose, James7b7060c2014-04-22 12:08:06 +0800455#if defined(ARCH_X86_HAVE_SSSE3)
Dan Albertebf0eb92014-08-22 13:19:24 -0700456extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
Rose, James7b7060c2014-04-22 12:08:06 +0800457 const short *coef, uint32_t count);
Dan Albertebf0eb92014-08-22 13:19:24 -0700458extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
Rose, James7b7060c2014-04-22 12:08:06 +0800459 const short *coef, uint32_t count);
Dan Albertebf0eb92014-08-22 13:19:24 -0700460extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
Rose, James7b7060c2014-04-22 12:08:06 +0800461 const short *coef, uint32_t count);
462
463void * selectKernel(Key_t key)
464{
Chris Wailes44bef6f2014-08-12 13:51:10 -0700465 void * kernel = nullptr;
Rose, James7b7060c2014-04-22 12:08:06 +0800466
467 // inType, outType float if nonzero
468 if (!(key.u.inType || key.u.outType)) {
469 if (key.u.dot)
470 kernel = (void *)rsdIntrinsicColorMatrixDot_K;
471 else if (key.u.copyAlpha)
472 kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
473 else
474 kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
475 }
476
477 return kernel;
478}
479#endif
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700480
Jason Samsa65de102013-08-09 13:42:28 -0700481bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
Jason Sams074424a2014-05-22 13:30:03 -0700482#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700483 mBufSize = 4096;
484 //StopWatch build_time("rs cm: build time");
485 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
486 MAP_PRIVATE | MAP_ANON, -1, 0);
Jason Samsc214fe52014-10-08 15:57:34 -0700487 if (mBuf == MAP_FAILED) {
488 mBuf = NULL;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700489 return false;
490 }
491
492 uint8_t *buf = mBuf;
Chris Wailes44bef6f2014-08-12 13:51:10 -0700493 uint8_t *buf2 = nullptr;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700494
Jason Samsa65de102013-08-09 13:42:28 -0700495 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final
496 int opInit[4] = {0, 0, 0, 0};
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700497
Jason Samsa65de102013-08-09 13:42:28 -0700498 memset(ops, 0, sizeof(ops));
499 for (int i=0; i < 4; i++) {
500 if (key.u.coeffMask & (1 << (i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700501 ops[i][0] = 0x2 | opInit[0];
Jason Samsa65de102013-08-09 13:42:28 -0700502 opInit[0] = 1;
503 }
504 if (!key.u.dot) {
505 if (key.u.coeffMask & (1 << (1 + i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700506 ops[i][1] = 0x2 | opInit[1];
Jason Samsa65de102013-08-09 13:42:28 -0700507 opInit[1] = 1;
508 }
509 if (key.u.coeffMask & (1 << (2 + i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700510 ops[i][2] = 0x2 | opInit[2];
Jason Samsa65de102013-08-09 13:42:28 -0700511 opInit[2] = 1;
512 }
513 }
514 if (!key.u.copyAlpha) {
515 if (key.u.coeffMask & (1 << (3 + i*4))) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700516 ops[i][3] = 0x2 | opInit[3];
Jason Samsa65de102013-08-09 13:42:28 -0700517 opInit[3] = 1;
518 }
519 }
520 }
Jason Samsa65de102013-08-09 13:42:28 -0700521
Jason Samsa65de102013-08-09 13:42:28 -0700522 if (key.u.inType || key.u.outType) {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700523 key.u.copyAlpha = 0;
Jason Samsa65de102013-08-09 13:42:28 -0700524 ADD_CHUNK(prefix_f);
525 buf2 = buf;
526
527 // Load the incoming r,g,b,a as needed
528 if (key.u.inType) {
529 switch(key.u.inVecSize) {
530 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700531 ADD_CHUNK(load_f32_4);
532 break;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700533 case 2:
534 ADD_CHUNK(load_f32_3);
535 break;
Jason Samsa65de102013-08-09 13:42:28 -0700536 case 1:
537 ADD_CHUNK(load_f32_2);
538 break;
539 case 0:
540 ADD_CHUNK(load_f32_1);
541 break;
542 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700543 } else {
Jason Samsa65de102013-08-09 13:42:28 -0700544 switch(key.u.inVecSize) {
545 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700546 ADD_CHUNK(load_u8f_4);
547 break;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700548 case 2:
549 ADD_CHUNK(load_u8f_3);
550 break;
Jason Samsa65de102013-08-09 13:42:28 -0700551 case 1:
552 ADD_CHUNK(load_u8f_2);
553 break;
554 case 0:
555 ADD_CHUNK(load_u8f_1);
556 break;
557 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700558 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700559
Jason Samsa65de102013-08-09 13:42:28 -0700560 for (int i=0; i < 4; i++) {
561 for (int j=0; j < 4; j++) {
Jason Samsa65de102013-08-09 13:42:28 -0700562 switch(ops[i][j]) {
563 case 0:
564 break;
Jason Samsa65de102013-08-09 13:42:28 -0700565 case 2:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700566 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
567 break;
568 case 3:
569 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
Jason Samsa65de102013-08-09 13:42:28 -0700570 break;
571 }
572 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700573 }
Jason Samsa65de102013-08-09 13:42:28 -0700574 for (int j=0; j < 4; j++) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700575 if (opInit[j]) {
576 if (key.u.addMask & (1 << j)) {
577 buf = addVADD_F32(buf, j, 12+j, 8+j);
578 } else {
579 buf = addVORR_32(buf, j, 12+j, 12+j);
580 }
581 } else {
582 if (key.u.addMask & (1 << j)) {
Simon Hosiec7c255e2014-03-07 16:23:12 -0800583 buf = addVORR_32(buf, j, 8+j, 8+j);
584 } else {
585 buf = addVMOV_32(buf, j, 0);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700586 }
587 }
588 }
589
590 if (key.u.outType) {
591 switch(key.u.outVecSize) {
592 case 3:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700593 ADD_CHUNK(store_f32_4);
594 break;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700595 case 2:
596 ADD_CHUNK(store_f32_3);
597 break;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700598 case 1:
599 ADD_CHUNK(store_f32_2);
600 break;
601 case 0:
602 ADD_CHUNK(store_f32_1);
603 break;
604 }
605 } else {
606 switch(key.u.outVecSize) {
607 case 3:
608 case 2:
609 ADD_CHUNK(store_f32u_4);
610 break;
611 case 1:
612 ADD_CHUNK(store_f32u_2);
613 break;
614 case 0:
615 ADD_CHUNK(store_f32u_1);
616 break;
Jason Samsa65de102013-08-09 13:42:28 -0700617 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700618 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700619
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700620
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700621 } else {
Jason Samsa65de102013-08-09 13:42:28 -0700622 // Add the function prefix
623 // Store the address for the loop return
624 ADD_CHUNK(prefix_i);
625 buf2 = buf;
626
627 // Load the incoming r,g,b,a as needed
628 switch(key.u.inVecSize) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700629 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700630 ADD_CHUNK(load_u8_4);
631 if (key.u.copyAlpha) {
632 ADD_CHUNK(unpack_u8_3);
633 } else {
634 ADD_CHUNK(unpack_u8_4);
635 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700636 break;
637 case 2:
Jason Sams9e4a96a2013-09-11 15:52:22 -0700638 ADD_CHUNK(load_u8_3);
Jason Samsa65de102013-08-09 13:42:28 -0700639 ADD_CHUNK(unpack_u8_3);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700640 break;
641 case 1:
Jason Samsa65de102013-08-09 13:42:28 -0700642 ADD_CHUNK(load_u8_2);
643 ADD_CHUNK(unpack_u8_2);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700644 break;
645 case 0:
Jason Samsa65de102013-08-09 13:42:28 -0700646 ADD_CHUNK(load_u8_1);
647 ADD_CHUNK(unpack_u8_1);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700648 break;
649 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700650
Jason Samsa65de102013-08-09 13:42:28 -0700651 // Add multiply and accumulate
652 // use MULL to init the output register,
653 // use MLAL from there
654 for (int i=0; i < 4; i++) {
655 for (int j=0; j < 4; j++) {
Jason Samsa65de102013-08-09 13:42:28 -0700656 switch(ops[i][j]) {
657 case 0:
658 break;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700659 case 2:
Jason Samsa65de102013-08-09 13:42:28 -0700660 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
661 break;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700662 case 3:
Jason Samsa65de102013-08-09 13:42:28 -0700663 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
664 break;
665 }
666 }
667 }
668 for (int j=0; j < 4; j++) {
Jason Samsec3cd2d2013-09-11 18:08:47 -0700669 if (opInit[j]) {
670 if (key.u.addMask & (1 << j)) {
671 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
672 }
673 } else {
674 if (key.u.addMask & (1 << j)) {
Simon Hosiec7c255e2014-03-07 16:23:12 -0800675 buf = addVORR_32(buf, 8+j, 4+j, 4+j);
Jason Samsec3cd2d2013-09-11 18:08:47 -0700676 }
Jason Samsa65de102013-08-09 13:42:28 -0700677 }
678 }
679
680 // If we have a dot product, perform the special pack.
681 if (key.u.dot) {
682 ADD_CHUNK(pack_u8_1);
683 ADD_CHUNK(dot);
684 } else {
685 switch(key.u.outVecSize) {
686 case 3:
Jason Sams17e3cdc2013-09-09 17:32:16 -0700687 if (key.u.copyAlpha) {
688 ADD_CHUNK(pack_u8_3);
689 } else {
690 ADD_CHUNK(pack_u8_4);
691 }
Jason Samsa65de102013-08-09 13:42:28 -0700692 break;
693 case 2:
694 ADD_CHUNK(pack_u8_3);
695 break;
696 case 1:
697 ADD_CHUNK(pack_u8_2);
698 break;
699 case 0:
700 ADD_CHUNK(pack_u8_1);
701 break;
702 }
703 }
704
705 // Write out result
706 switch(key.u.outVecSize) {
707 case 3:
708 case 2:
709 ADD_CHUNK(store_u8_4);
710 break;
711 case 1:
712 ADD_CHUNK(store_u8_2);
713 break;
714 case 0:
715 ADD_CHUNK(store_u8_1);
716 break;
717 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700718 }
719
Jason Sams2b0d8e62013-08-29 16:41:01 -0700720 if (key.u.inType != key.u.outType) {
721 key.u.copyAlpha = 0;
722 key.u.dot = 0;
723 }
724
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700725 // Loop, branch, and cleanup
726 ADD_CHUNK(postfix1);
727 buf = addBranch(buf, buf2, 0x01);
728 ADD_CHUNK(postfix2);
729
730 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
731 if (ret == -1) {
732 ALOGE("mprotect error %i", ret);
733 return false;
734 }
735
Stephen Hines45e753a2015-01-19 20:58:44 -0800736 __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700737 return true;
738#else
739 return false;
740#endif
741}
742
Jason Samsec3cd2d2013-09-11 18:08:47 -0700743void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700744 for(int ct=0; ct < 16; ct++) {
Jason Sams2b0d8e62013-08-29 16:41:01 -0700745 ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
746 tmpFp[ct] = fp[ct] * fpMul;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700747 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700748 }
749
Jason Samsec3cd2d2013-09-11 18:08:47 -0700750 float add = 0.f;
751 if (fpMul > 254.f) add = 0.5f;
Jason Sams2b0d8e62013-08-29 16:41:01 -0700752 for(int ct=0; ct < 4; ct++) {
Simon Hosie0462a392014-03-07 19:36:44 -0800753 tmpFpa[ct] = fpa[ct] * addMul + add;
Jason Sams9e4a96a2013-09-11 15:52:22 -0700754 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700755 }
756
Jason Samsec3cd2d2013-09-11 18:08:47 -0700757 for(int ct=0; ct < 4; ct++) {
Simon Hosie0462a392014-03-07 19:36:44 -0800758 ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
Jason Sams2b0d8e62013-08-29 16:41:01 -0700759 }
Jason Sams2b0d8e62013-08-29 16:41:01 -0700760}
761
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700762void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
763 size_t dataLength) {
764 switch(slot) {
765 case 0:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700766 memcpy (fp, data, sizeof(fp));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700767 break;
768 case 1:
Jason Sams2b0d8e62013-08-29 16:41:01 -0700769 memcpy (fpa, data, sizeof(fpa));
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700770 break;
771 default:
772 rsAssert(0);
773 break;
774 }
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700775 mRootPtr = &kernel;
776}
777
Jason Sams709a0972012-11-15 18:18:04 -0800778
David Grossb0abb142015-03-12 15:23:03 -0700779static void One(const RsExpandKernelDriverInfo *info, void *out,
Jason Sams17e3cdc2013-09-09 17:32:16 -0700780 const void *py, const float* coeff, const float *add,
Jason Samsa65de102013-08-09 13:42:28 -0700781 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
782
783 float4 f = 0.f;
784 if (fin) {
785 switch(vsin) {
786 case 3:
Jason Sams9e4a96a2013-09-11 15:52:22 -0700787 f = ((const float4 *)py)[0];
788 break;
Jason Samsa65de102013-08-09 13:42:28 -0700789 case 2:
790 f = ((const float4 *)py)[0];
Jason Sams9e4a96a2013-09-11 15:52:22 -0700791 f.w = 0.f;
Jason Samsa65de102013-08-09 13:42:28 -0700792 break;
793 case 1:
794 f.xy = ((const float2 *)py)[0];
795 break;
796 case 0:
797 f.x = ((const float *)py)[0];
798 break;
799 }
800 } else {
801 switch(vsin) {
802 case 3:
Jason Sams9e4a96a2013-09-11 15:52:22 -0700803 f = convert_float4(((const uchar4 *)py)[0]);
804 break;
Jason Samsa65de102013-08-09 13:42:28 -0700805 case 2:
806 f = convert_float4(((const uchar4 *)py)[0]);
Jason Sams9e4a96a2013-09-11 15:52:22 -0700807 f.w = 0.f;
Jason Samsa65de102013-08-09 13:42:28 -0700808 break;
809 case 1:
Jason Sams68c81722013-08-21 16:58:27 -0700810 f.xy = convert_float2(((const uchar2 *)py)[0]);
Jason Samsa65de102013-08-09 13:42:28 -0700811 break;
812 case 0:
Jason Sams68c81722013-08-21 16:58:27 -0700813 f.x = (float)(((const uchar *)py)[0]);
Jason Samsa65de102013-08-09 13:42:28 -0700814 break;
815 }
816 }
Jason Sams2b0d8e62013-08-29 16:41:01 -0700817 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
Jason Sams709a0972012-11-15 18:18:04 -0800818
819 float4 sum;
Jason Samsa65de102013-08-09 13:42:28 -0700820 sum.x = f.x * coeff[0] +
821 f.y * coeff[4] +
822 f.z * coeff[8] +
823 f.w * coeff[12];
824 sum.y = f.x * coeff[1] +
825 f.y * coeff[5] +
826 f.z * coeff[9] +
827 f.w * coeff[13];
828 sum.z = f.x * coeff[2] +
829 f.y * coeff[6] +
830 f.z * coeff[10] +
831 f.w * coeff[14];
832 sum.w = f.x * coeff[3] +
833 f.y * coeff[7] +
834 f.z * coeff[11] +
835 f.w * coeff[15];
Jason Sams2b0d8e62013-08-29 16:41:01 -0700836 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
Jason Sams709a0972012-11-15 18:18:04 -0800837
Jason Sams17e3cdc2013-09-09 17:32:16 -0700838 sum.x += add[0];
Simon Hosie0462a392014-03-07 19:36:44 -0800839 sum.y += add[1];
840 sum.z += add[2];
841 sum.w += add[3];
Jason Sams17e3cdc2013-09-09 17:32:16 -0700842
Jason Sams709a0972012-11-15 18:18:04 -0800843
Jason Sams2b0d8e62013-08-29 16:41:01 -0700844 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
Jason Samsa65de102013-08-09 13:42:28 -0700845 if (fout) {
846 switch(vsout) {
847 case 3:
848 case 2:
849 ((float4 *)out)[0] = sum;
850 break;
851 case 1:
852 ((float2 *)out)[0] = sum.xy;
853 break;
854 case 0:
855 ((float *)out)[0] = sum.x;
856 break;
857 }
858 } else {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700859 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
860 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
861 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
862 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
Jason Sams17e3cdc2013-09-09 17:32:16 -0700863
Jason Samsa65de102013-08-09 13:42:28 -0700864 switch(vsout) {
865 case 3:
866 case 2:
867 ((uchar4 *)out)[0] = convert_uchar4(sum);
868 break;
869 case 1:
870 ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
871 break;
872 case 0:
873 ((uchar *)out)[0] = sum.x;
874 break;
875 }
876 }
Jason Sams2b0d8e62013-08-29 16:41:01 -0700877 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
Jason Sams709a0972012-11-15 18:18:04 -0800878}
879
David Grossb0abb142015-03-12 15:23:03 -0700880void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700881 uint32_t xstart, uint32_t xend,
Chris Wailes9ed79102014-07-25 15:53:28 -0700882 uint32_t outstep) {
David Grossb0abb142015-03-12 15:23:03 -0700883 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
Chris Wailesf3712132014-07-16 15:18:30 -0700884
David Grossb0abb142015-03-12 15:23:03 -0700885 uint32_t instep = info->inStride[0];
Chris Wailesf3712132014-07-16 15:18:30 -0700886
David Grossb0abb142015-03-12 15:23:03 -0700887 uchar *out = (uchar *)info->outPtr[0];
888 uchar *in = (uchar *)info->inPtr[0];
Jason Sams709a0972012-11-15 18:18:04 -0800889 uint32_t x1 = xstart;
890 uint32_t x2 = xend;
891
Jason Samsa65de102013-08-09 13:42:28 -0700892 uint32_t vsin = cp->mLastKey.u.inVecSize;
893 uint32_t vsout = cp->mLastKey.u.outVecSize;
894 bool floatIn = !!cp->mLastKey.u.inType;
895 bool floatOut = !!cp->mLastKey.u.outType;
896
David Grossb0abb142015-03-12 15:23:03 -0700897 //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
Jason Sams9e4a96a2013-09-11 15:52:22 -0700898
Jason Sams709a0972012-11-15 18:18:04 -0800899 if(x2 > x1) {
Simon Hosie0462a392014-03-07 19:36:44 -0800900 int32_t len = x2 - x1;
901 if (gArchUseSIMD) {
Chris Wailes44bef6f2014-08-12 13:51:10 -0700902 if((cp->mOptKernel != nullptr) && (len >= 4)) {
Jason Sams858d0352014-04-29 18:10:50 -0700903 // The optimized kernel processes 4 pixels at once
904 // and requires a minimum of 1 chunk of 4
Simon Hosie0462a392014-03-07 19:36:44 -0800905 cp->mOptKernel(out, in, cp->ip, len >> 2);
Jason Sams858d0352014-04-29 18:10:50 -0700906 // Update the len and pointers so the generic code can
907 // finish any leftover pixels
Jason Sams98dd4bb2014-04-29 15:30:30 -0700908 len &= ~3;
Simon Hosie0462a392014-03-07 19:36:44 -0800909 x1 += len;
910 out += outstep * len;
911 in += instep * len;
912 }
Jason Sams32f9d042014-10-22 17:25:51 -0700913#if defined(ARCH_ARM64_USE_INTRINSICS)
Simon Hosie0462a392014-03-07 19:36:44 -0800914 else {
Simon Hosie0462a392014-03-07 19:36:44 -0800915 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
Jason Sams32f9d042014-10-22 17:25:51 -0700916 // Currently this generates off by one errors.
917 //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
918 //x1 += len;
919 //out += outstep * len;
920 //in += instep * len;
Simon Hosie0462a392014-03-07 19:36:44 -0800921 } else {
Simon Hosie6e7e2582014-05-06 01:07:21 -0700922 rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
Jason Sams32f9d042014-10-22 17:25:51 -0700923 x1 += len;
924 out += outstep * len;
925 in += instep * len;
Simon Hosie0462a392014-03-07 19:36:44 -0800926 }
Simon Hosie0462a392014-03-07 19:36:44 -0800927 }
928#endif
Jason Sams709a0972012-11-15 18:18:04 -0800929 }
Jason Sams709a0972012-11-15 18:18:04 -0800930
931 while(x1 != x2) {
David Grossb0abb142015-03-12 15:23:03 -0700932 One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
Jason Sams68c81722013-08-21 16:58:27 -0700933 out += outstep;
934 in += instep;
Jason Sams709a0972012-11-15 18:18:04 -0800935 x1++;
936 }
937 }
938}
939
Chris Wailesf3712132014-07-16 15:18:30 -0700940void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
941 const Allocation ** ains,
942 uint32_t inLen,
943 Allocation * aout,
944 const void * usr,
945 uint32_t usrLen,
946 const RsScriptCall *sc) {
Jason Sams709a0972012-11-15 18:18:04 -0800947
Chris Wailesf3712132014-07-16 15:18:30 -0700948 const Element *ein = ains[0]->mHal.state.type->getElement();
Jason Sams17e3cdc2013-09-09 17:32:16 -0700949 const Element *eout = aout->mHal.state.type->getElement();
950
951 if (ein->getType() == eout->getType()) {
Jason Samsec3cd2d2013-09-11 18:08:47 -0700952 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
953 updateCoeffCache(1.f, 255.f);
954 } else {
955 updateCoeffCache(1.f, 1.f);
956 }
Jason Sams17e3cdc2013-09-09 17:32:16 -0700957 } else {
958 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700959 updateCoeffCache(255.f, 255.f);
Jason Sams17e3cdc2013-09-09 17:32:16 -0700960 } else {
Jason Sams9e4a96a2013-09-11 15:52:22 -0700961 updateCoeffCache(1.f / 255.f, 1.f);
Jason Sams17e3cdc2013-09-09 17:32:16 -0700962 }
963 }
964
Chris Wailesf3712132014-07-16 15:18:30 -0700965 Key_t key = computeKey(ein, eout);
966
Rose, James7b7060c2014-04-22 12:08:06 +0800967#if defined(ARCH_X86_HAVE_SSSE3)
Chris Wailes44bef6f2014-08-12 13:51:10 -0700968 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
Rose, James7b7060c2014-04-22 12:08:06 +0800969 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
970 // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
971 mLastKey = key;
972 }
973
974#else //if !defined(ARCH_X86_HAVE_SSSE3)
Chris Wailes44bef6f2014-08-12 13:51:10 -0700975 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700976 if (mBuf) munmap(mBuf, mBufSize);
Chris Wailes44bef6f2014-08-12 13:51:10 -0700977 mBuf = nullptr;
978 mOptKernel = nullptr;
Jason Sams9b2b9ef2013-07-29 17:38:00 -0700979 if (build(key)) {
980 mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
Jason Sams709a0972012-11-15 18:18:04 -0800981 }
Jason Sams32f9d042014-10-22 17:25:51 -0700982#if defined(ARCH_ARM64_USE_INTRINSICS)
Simon Hosie0462a392014-03-07 19:36:44 -0800983 else {
984 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
985 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
986 uint32_t mm = 0;
987 int i;
988 for (i = 0; i < 4; i++)
989 {
990 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
991 m = ((m * 0x249) >> 9) & 15;
992 m |= ((key.u.addMask >> i) & 1) << 4;
993 mm |= m << (i * 5);
994 }
995
996 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
997 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
998 } else {
999 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1000 }
1001 }
1002#endif
1003 mLastKey = key;
Jason Sams709a0972012-11-15 18:18:04 -08001004 }
Rose, James7b7060c2014-04-22 12:08:06 +08001005#endif //if !defined(ARCH_X86_HAVE_SSSE3)
Jason Sams709a0972012-11-15 18:18:04 -08001006}
1007
Jason Sams709a0972012-11-15 18:18:04 -08001008RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
Jason Samsc905efd2012-11-26 15:20:18 -08001009 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1010 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
Jason Sams709a0972012-11-15 18:18:04 -08001011
Jason Samsa65de102013-08-09 13:42:28 -07001012 mLastKey.key = 0;
Chris Wailes44bef6f2014-08-12 13:51:10 -07001013 mBuf = nullptr;
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001014 mBufSize = 0;
Chris Wailes44bef6f2014-08-12 13:51:10 -07001015 mOptKernel = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -08001016 const static float defaultMatrix[] = {
1017 1.f, 0.f, 0.f, 0.f,
1018 0.f, 1.f, 0.f, 0.f,
1019 0.f, 0.f, 1.f, 0.f,
1020 0.f, 0.f, 0.f, 1.f
1021 };
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001022 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
Jason Sams709a0972012-11-15 18:18:04 -08001023 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001024 setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
Jason Sams709a0972012-11-15 18:18:04 -08001025}
1026
1027RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001028 if (mBuf) munmap(mBuf, mBufSize);
Chris Wailes44bef6f2014-08-12 13:51:10 -07001029 mBuf = nullptr;
1030 mOptKernel = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -08001031}
1032
1033void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
Jason Sams9b2b9ef2013-07-29 17:38:00 -07001034 s->mHal.info.exportedVariableCount = 2;
Jason Sams709a0972012-11-15 18:18:04 -08001035}
1036
Jason Samsc905efd2012-11-26 15:20:18 -08001037RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1038 const Script *s, const Element *e) {
Jason Sams709a0972012-11-15 18:18:04 -08001039
Jason Samsc905efd2012-11-26 15:20:18 -08001040 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
Jason Sams709a0972012-11-15 18:18:04 -08001041}