Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2012 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 17 | #include <sys/mman.h> |
| 18 | #include <unistd.h> |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 19 | |
| 20 | #include "rsCpuIntrinsic.h" |
| 21 | #include "rsCpuIntrinsicInlines.h" |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 22 | #include "linkloader/include/MemChunk.h" |
Narayan Kamath | 72f5f8c | 2014-03-11 12:23:29 +0000 | [diff] [blame] | 23 | #include "linkloader/utils/flush_cpu_cache.h" |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 24 | |
| 25 | #include <sys/mman.h> |
| 26 | #include <stddef.h> |
| 27 | #include <stdint.h> |
| 28 | #include <stdlib.h> |
| 29 | //#include <utils/StopWatch.h> |
| 30 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 31 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 32 | /* uint kernel |
| 33 | * Q0 D0: Load slot for R |
| 34 | * D1: Load slot for G |
| 35 | * Q1 D2: Load slot for B |
| 36 | * D3: Load slot for A |
| 37 | * Q2 D4: Matrix |
| 38 | * D5: = |
| 39 | * Q3 D6: = |
| 40 | * D7: = |
| 41 | * Q4 D8: Add R |
| 42 | * D9: |
| 43 | * Q5 D10: Add G |
| 44 | * D11: |
| 45 | * Q6 D12: Add B |
| 46 | * D13: |
| 47 | * Q7 D14: Add A |
| 48 | * D15: |
| 49 | * Q8 D16: I32: R Sum |
| 50 | * D17: |
| 51 | * Q9 D18: I32: G Sum |
| 52 | * D19: |
| 53 | * Q10 D20: I32: B Sum |
| 54 | * D21: |
| 55 | * Q11 D22: I32: A Sum |
| 56 | * D23: |
| 57 | * Q12 D24: U16: expanded R |
| 58 | * D25: |
| 59 | * Q13 D26: U16: expanded G |
| 60 | * D27: |
| 61 | * Q14 D28: U16: expanded B |
| 62 | * D29: |
| 63 | * Q15 D30: U16: expanded A |
| 64 | * D31: |
| 65 | * |
| 66 | */ |
| 67 | |
| 68 | /* float kernel |
| 69 | * Q0 D0: Load slot for R |
| 70 | * D1: = |
| 71 | * Q1 D2: Load slot for G |
| 72 | * D3: = |
| 73 | * Q2 D4: Load slot for B |
| 74 | * D5: = |
| 75 | * Q3 D6: Load slot for A |
| 76 | * D7: = |
| 77 | * Q4 D8: Matrix |
| 78 | * D9: = |
| 79 | * Q5 D10: = |
| 80 | * D11: = |
| 81 | * Q6 D12: = |
| 82 | * D13: = |
| 83 | * Q7 D14: = |
| 84 | * D15: = |
| 85 | * Q8 D16: Add R |
| 86 | * D17: = |
| 87 | * Q9 D18: Add G |
| 88 | * D19: = |
| 89 | * Q10 D20: Add B |
| 90 | * D21: = |
| 91 | * Q11 D22: Add A |
| 92 | * D23: = |
| 93 | * Q12 D24: Sum R |
| 94 | * D25: = |
| 95 | * Q13 D26: Sum G |
| 96 | * D27: = |
| 97 | * Q14 D28: Sum B |
| 98 | * D29: = |
| 99 | * Q15 D30: Sum A |
| 100 | * D31: = |
| 101 | * |
| 102 | */ |
| 103 | |
| 104 | |
| 105 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 106 | using namespace android; |
| 107 | using namespace android::renderscript; |
| 108 | |
| 109 | namespace android { |
| 110 | namespace renderscript { |
| 111 | |
/**
 * Key describing one specialization of the color matrix kernel.
 *
 * The bit-fields in `u` share storage with the 64-bit `key`, so the whole
 * description can be cleared by assigning a single integer (computeKey()
 * starts from key = 0 and then fills in the individual fields).
 */
union Key_t {
    uint64_t key;
    struct {
        // Vector size of the input element minus one (0 = scalar, 3 = 4-wide).
        uint32_t inVecSize  :2;   // [0 - 1]
        // Vector size of the output element minus one.
        uint32_t outVecSize :2;   // [2 - 3]
        // RS_TYPE_FLOAT_32 when the input is float, otherwise 0.
        uint32_t inType     :4;   // [4 - 7]
        // RS_TYPE_FLOAT_32 when the output is float, otherwise 0.
        uint32_t outType    :4;   // [8 - 11]
        // Set when the r,g,b columns of the matrix match and nothing is added.
        uint32_t dot        :1;   // [12]
        uint32_t _unused1   :1;   // [13]
        // Set when alpha passes through unchanged (integer path only).
        uint32_t copyAlpha  :1;   // [14]
        uint32_t _unused2   :1;   // [15]
        // One bit per matrix coefficient that is non-zero.
        uint32_t coeffMask  :16;  // [16-31]
        // One bit per output channel that needs an add term.
        uint32_t addMask    :4;   // [32-35]
    } u;
};
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 127 | |
Tim Murray | 6a45ddb | 2014-08-06 11:49:02 -0700 | [diff] [blame] | 128 | //Re-enable when intrinsic is fixed |
| 129 | #if 0 && defined(ARCH_ARM64_USE_INTRINSICS) |
/**
 * Table of code entry points handed to the ARM64 assembly kernels.
 *
 * The table is filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and
 * read by the rsdIntrinsicColorMatrix_*_K kernels. The member order is part of
 * the contract with that assembly code and must not change.
 * NOTE(review): presumably one `column` entry per matrix column, with
 * load/store (and their *_end variants) selecting the element format; confirm
 * against the assembly source.
 */
struct FunctionTab_t {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
};
| 137 | |
Simon Hosie | 6e7e258 | 2014-05-06 01:07:21 -0700 | [diff] [blame] | 138 | extern "C" void rsdIntrinsicColorMatrix_int_K( |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 139 | void *out, void const *in, size_t count, |
| 140 | FunctionTab_t const *fns, |
| 141 | int16_t const *mult, int32_t const *add); |
| 142 | |
Simon Hosie | 6e7e258 | 2014-05-06 01:07:21 -0700 | [diff] [blame] | 143 | extern "C" void rsdIntrinsicColorMatrix_float_K( |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 144 | void *out, void const *in, size_t count, |
| 145 | FunctionTab_t const *fns, |
| 146 | float const *mult, float const *add); |
| 147 | |
Simon Hosie | 6e7e258 | 2014-05-06 01:07:21 -0700 | [diff] [blame] | 148 | /* The setup functions fill in function tables to be used by above functions; |
| 149 | * this code also eliminates jump-to-another-jump cases by short-circuiting |
| 150 | * empty functions. While it's not performance critical, it works out easier |
| 151 | * to write the set-up code in assembly than to try to expose the same symbols |
| 152 | * and write the code in C. |
| 153 | */ |
| 154 | extern "C" void rsdIntrinsicColorMatrixSetup_int_K( |
| 155 | FunctionTab_t *fns, |
| 156 | uint32_t mask, int dt, int st); |
| 157 | |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 158 | extern "C" void rsdIntrinsicColorMatrixSetup_float_K( |
Simon Hosie | 6e7e258 | 2014-05-06 01:07:21 -0700 | [diff] [blame] | 159 | FunctionTab_t *fns, |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 160 | uint32_t mask, int dt, int st); |
| 161 | #endif |
| 162 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 163 | class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic { |
| 164 | public: |
| 165 | virtual void populateScript(Script *); |
| 166 | |
| 167 | virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); |
| 168 | |
| 169 | virtual ~RsdCpuScriptIntrinsicColorMatrix(); |
Jason Sams | c905efd | 2012-11-26 15:20:18 -0800 | [diff] [blame] | 170 | RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 171 | |
Chris Wailes | f371213 | 2014-07-16 15:18:30 -0700 | [diff] [blame] | 172 | virtual void preLaunch(uint32_t slot, const Allocation ** ains, |
| 173 | uint32_t inLen, Allocation * aout, const void * usr, |
| 174 | uint32_t usrLen, const RsScriptCall *sc); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 175 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 176 | protected: |
| 177 | float fp[16]; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 178 | float fpa[4]; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 179 | |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 180 | // The following four fields are read as constants |
| 181 | // by the SIMD assembly code. |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 182 | short ip[16]; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 183 | int ipa[4]; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 184 | float tmpFp[16]; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 185 | float tmpFpa[4]; |
Tim Murray | 6a45ddb | 2014-08-06 11:49:02 -0700 | [diff] [blame] | 186 | #if 0 && defined(ARCH_ARM64_USE_INTRINSICS) |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 187 | FunctionTab_t mFnTab; |
| 188 | #endif |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 189 | |
Chris Wailes | 80ef693 | 2014-07-08 11:22:18 -0700 | [diff] [blame] | 190 | static void kernel(const RsExpandKernelParams *p, |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 191 | uint32_t xstart, uint32_t xend, |
Chris Wailes | 9ed7910 | 2014-07-25 15:53:28 -0700 | [diff] [blame] | 192 | uint32_t outstep); |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 193 | void updateCoeffCache(float fpMul, float addMul); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 194 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 195 | Key_t mLastKey; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 196 | unsigned char *mBuf; |
| 197 | size_t mBufSize; |
| 198 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 199 | Key_t computeKey(const Element *ein, const Element *eout); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 200 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 201 | bool build(Key_t key); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 202 | |
| 203 | void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count); |
| 204 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 205 | }; |
| 206 | |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 211 | Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey( |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 212 | const Element *ein, const Element *eout) { |
| 213 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 214 | Key_t key; |
| 215 | key.key = 0; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 216 | |
| 217 | // Compute a unique code key for this operation |
| 218 | |
| 219 | // Add to the key the input and output types |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 220 | bool hasFloat = false; |
| 221 | if (ein->getType() == RS_TYPE_FLOAT_32) { |
| 222 | hasFloat = true; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 223 | key.u.inType = RS_TYPE_FLOAT_32; |
| 224 | rsAssert(key.u.inType == RS_TYPE_FLOAT_32); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 225 | } |
| 226 | if (eout->getType() == RS_TYPE_FLOAT_32) { |
| 227 | hasFloat = true; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 228 | key.u.outType = RS_TYPE_FLOAT_32; |
| 229 | rsAssert(key.u.outType == RS_TYPE_FLOAT_32); |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 230 | } |
| 231 | |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 232 | // Mask in the bits indicating which coefficients in the |
| 233 | // color matrix are needed. |
| 234 | if (hasFloat) { |
| 235 | for (uint32_t i=0; i < 16; i++) { |
| 236 | if (fabs(fp[i]) != 0.f) { |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 237 | key.u.coeffMask |= 1 << i; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 238 | } |
| 239 | } |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 240 | if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 241 | if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2; |
| 242 | if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4; |
| 243 | if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 244 | |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 245 | } else { |
| 246 | for (uint32_t i=0; i < 16; i++) { |
| 247 | if (ip[i] != 0) { |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 248 | key.u.coeffMask |= 1 << i; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 249 | } |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 250 | } |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 251 | if (ipa[0] != 0) key.u.addMask |= 0x1; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 252 | if (ipa[1] != 0) key.u.addMask |= 0x2; |
| 253 | if (ipa[2] != 0) key.u.addMask |= 0x4; |
| 254 | if (ipa[3] != 0) key.u.addMask |= 0x8; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 255 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 256 | |
| 257 | // Look for a dot product where the r,g,b colums are the same |
| 258 | if ((ip[0] == ip[1]) && (ip[0] == ip[2]) && |
| 259 | (ip[4] == ip[5]) && (ip[4] == ip[6]) && |
| 260 | (ip[8] == ip[9]) && (ip[8] == ip[10]) && |
| 261 | (ip[12] == ip[13]) && (ip[12] == ip[14])) { |
| 262 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 263 | if (!key.u.addMask) key.u.dot = 1; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 264 | } |
| 265 | |
| 266 | // Is alpha a simple copy |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 267 | if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) { |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 268 | key.u.copyAlpha = !(key.u.inType || key.u.outType); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 269 | } |
| 270 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 271 | //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); |
| 272 | |
| 273 | switch (ein->getVectorSize()) { |
| 274 | case 4: |
| 275 | key.u.inVecSize = 3; |
| 276 | break; |
| 277 | case 3: |
| 278 | key.u.inVecSize = 2; |
| 279 | key.u.coeffMask &= ~0xF000; |
| 280 | break; |
| 281 | case 2: |
| 282 | key.u.inVecSize = 1; |
| 283 | key.u.coeffMask &= ~0xFF00; |
| 284 | break; |
| 285 | default: |
| 286 | key.u.coeffMask &= ~0xFFF0; |
| 287 | break; |
| 288 | } |
| 289 | |
| 290 | switch (eout->getVectorSize()) { |
| 291 | case 4: |
| 292 | key.u.outVecSize = 3; |
| 293 | break; |
| 294 | case 3: |
| 295 | key.u.outVecSize = 2; |
| 296 | key.u.coeffMask &= ~0x8888; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 297 | key.u.addMask &= 7; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 298 | break; |
| 299 | case 2: |
| 300 | key.u.outVecSize = 1; |
| 301 | key.u.coeffMask &= ~0xCCCC; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 302 | key.u.addMask &= 3; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 303 | break; |
| 304 | default: |
| 305 | key.u.coeffMask &= ~0xEEEE; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 306 | key.u.addMask &= 1; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 307 | break; |
| 308 | } |
| 309 | |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 310 | if (key.u.inType && !key.u.outType) { |
| 311 | key.u.addMask |= 1; |
| 312 | if (key.u.outVecSize > 0) key.u.addMask |= 2; |
| 313 | if (key.u.outVecSize > 1) key.u.addMask |= 4; |
| 314 | if (key.u.outVecSize > 2) key.u.addMask |= 8; |
| 315 | } |
| 316 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 317 | //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 318 | return key; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 319 | } |
| 320 | |
Jason Sams | 074424a | 2014-05-22 13:30:03 -0700 | [diff] [blame] | 321 | #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 322 | |
// Declares the symbols exported by the assembly source for the code fragment
// named x: the fragment itself (_N_ColorMatrix_<x>) and its byte length
// (_N_ColorMatrix_<x>_len), both of which ADD_CHUNK uses, plus
// _N_ColorMatrix_<x>_end (presumably the address just past the fragment;
// confirm against the assembly source).
#define DEF_SYM(x)                                  \
    extern "C" {                                    \
        extern uint32_t _N_ColorMatrix_##x;         \
        extern uint32_t _N_ColorMatrix_##x##_end;   \
        extern uint32_t _N_ColorMatrix_##x##_len;   \
    }

// Function prologues (integer / float) and epilogues.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads: u8 input, u8 input for the float kernel, f32 input.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores: u8 output, f32 output, u8 output from the float kernel.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer kernel helpers: unpack/pack, dot-product shortcut, add terms.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
| 370 | |
// Appends the code fragment named x to the buffer being assembled: copies
// _N_ColorMatrix_<x>_len bytes starting at _N_ColorMatrix_<x> to the write
// cursor `buf` (a byte pointer that must be in scope at the point of use) and
// advances `buf` past them.
//
// The body is wrapped in do { } while (0) so the copy and the cursor update
// behave as one statement. Without the wrapper, using the macro under an
// unbraced if/else would guard only the memcpy. Invoke it with a trailing
// semicolon, as before.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
| 374 | |
| 375 | |
| 376 | static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) { |
| 377 | size_t off = (target - buf - 8) >> 2; |
| 378 | rsAssert(((off & 0xff000000) == 0) || |
| 379 | ((off & 0xff000000) == 0xff000000)); |
| 380 | |
| 381 | uint32_t op = (condition << 28); |
| 382 | op |= 0xa << 24; // branch |
| 383 | op |= 0xffffff & off; |
| 384 | ((uint32_t *)buf)[0] = op; |
| 385 | return buf + 4; |
| 386 | } |
| 387 | |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 388 | static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 389 | rsAssert(vd < 32); |
| 390 | rsAssert(vm < 32); |
| 391 | rsAssert(vn < 32); |
| 392 | |
| 393 | uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); |
| 394 | op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); |
| 395 | op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 396 | return op; |
| 397 | } |
| 398 | |
| 399 | static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| 400 | //vmlal.s16 Q#1, D#1, D#2[#] |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 401 | uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 402 | ((uint32_t *)buf)[0] = op; |
| 403 | return buf + 4; |
| 404 | } |
| 405 | |
| 406 | static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| 407 | //vmull.s16 Q#1, D#1, D#2[#] |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 408 | uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 409 | ((uint32_t *)buf)[0] = op; |
| 410 | return buf + 4; |
| 411 | } |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 412 | |
| 413 | static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { |
Simon Hosie | c7c255e | 2014-03-07 16:23:12 -0800 | [diff] [blame] | 414 | //vqadd.s32 Q#1, Q#1, Q#2 |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 415 | uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 416 | ((uint32_t *)buf)[0] = op; |
| 417 | return buf + 4; |
| 418 | } |
| 419 | |
| 420 | static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| 421 | //vmlal.f32 Q#1, D#1, D#2[#] |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 422 | uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 423 | ((uint32_t *)buf)[0] = op; |
| 424 | return buf + 4; |
| 425 | } |
| 426 | |
| 427 | static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| 428 | //vmull.f32 Q#1, D#1, D#2[#] |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 429 | uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); |
| 430 | ((uint32_t *)buf)[0] = op; |
| 431 | return buf + 4; |
| 432 | } |
| 433 | |
| 434 | static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { |
| 435 | //vadd.f32 Q#1, D#1, D#2 |
| 436 | uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 437 | ((uint32_t *)buf)[0] = op; |
| 438 | return buf + 4; |
| 439 | } |
| 440 | |
Simon Hosie | c7c255e | 2014-03-07 16:23:12 -0800 | [diff] [blame] | 441 | static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) { |
| 442 | //vmov.32 Q#1, #imm |
| 443 | rsAssert(imm == 0); |
| 444 | uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0); |
| 445 | ((uint32_t *)buf)[0] = op; |
| 446 | return buf + 4; |
| 447 | } |
| 448 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 449 | static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { |
| 450 | //vadd.f32 Q#1, D#1, D#2 |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 451 | uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 452 | ((uint32_t *)buf)[0] = op; |
| 453 | return buf + 4; |
| 454 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 455 | #endif |
| 456 | |
Rose, James | 7b7060c | 2014-04-22 12:08:06 +0800 | [diff] [blame] | 457 | #if defined(ARCH_X86_HAVE_SSSE3) |
// SSSE3 kernels implemented outside this file. selectKernel() chooses among
// them; all three share one signature.
extern "C" {
void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
}  // extern "C"
| 464 | |
| 465 | void * selectKernel(Key_t key) |
| 466 | { |
| 467 | void * kernel = NULL; |
| 468 | |
| 469 | // inType, outType float if nonzero |
| 470 | if (!(key.u.inType || key.u.outType)) { |
| 471 | if (key.u.dot) |
| 472 | kernel = (void *)rsdIntrinsicColorMatrixDot_K; |
| 473 | else if (key.u.copyAlpha) |
| 474 | kernel = (void *)rsdIntrinsicColorMatrix3x3_K; |
| 475 | else |
| 476 | kernel = (void *)rsdIntrinsicColorMatrix4x4_K; |
| 477 | } |
| 478 | |
| 479 | return kernel; |
| 480 | } |
| 481 | #endif |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 482 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 483 | bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) { |
Jason Sams | 074424a | 2014-05-22 13:30:03 -0700 | [diff] [blame] | 484 | #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 485 | mBufSize = 4096; |
| 486 | //StopWatch build_time("rs cm: build time"); |
| 487 | mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, |
| 488 | MAP_PRIVATE | MAP_ANON, -1, 0); |
| 489 | if (!mBuf) { |
| 490 | return false; |
| 491 | } |
| 492 | |
| 493 | uint8_t *buf = mBuf; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 494 | uint8_t *buf2 = NULL; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 495 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 496 | int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final |
| 497 | int opInit[4] = {0, 0, 0, 0}; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 498 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 499 | memset(ops, 0, sizeof(ops)); |
| 500 | for (int i=0; i < 4; i++) { |
| 501 | if (key.u.coeffMask & (1 << (i*4))) { |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 502 | ops[i][0] = 0x2 | opInit[0]; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 503 | opInit[0] = 1; |
| 504 | } |
| 505 | if (!key.u.dot) { |
| 506 | if (key.u.coeffMask & (1 << (1 + i*4))) { |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 507 | ops[i][1] = 0x2 | opInit[1]; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 508 | opInit[1] = 1; |
| 509 | } |
| 510 | if (key.u.coeffMask & (1 << (2 + i*4))) { |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 511 | ops[i][2] = 0x2 | opInit[2]; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 512 | opInit[2] = 1; |
| 513 | } |
| 514 | } |
| 515 | if (!key.u.copyAlpha) { |
| 516 | if (key.u.coeffMask & (1 << (3 + i*4))) { |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 517 | ops[i][3] = 0x2 | opInit[3]; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 518 | opInit[3] = 1; |
| 519 | } |
| 520 | } |
| 521 | } |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 522 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 523 | if (key.u.inType || key.u.outType) { |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 524 | key.u.copyAlpha = 0; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 525 | ADD_CHUNK(prefix_f); |
| 526 | buf2 = buf; |
| 527 | |
| 528 | // Load the incoming r,g,b,a as needed |
| 529 | if (key.u.inType) { |
| 530 | switch(key.u.inVecSize) { |
| 531 | case 3: |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 532 | ADD_CHUNK(load_f32_4); |
| 533 | break; |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 534 | case 2: |
| 535 | ADD_CHUNK(load_f32_3); |
| 536 | break; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 537 | case 1: |
| 538 | ADD_CHUNK(load_f32_2); |
| 539 | break; |
| 540 | case 0: |
| 541 | ADD_CHUNK(load_f32_1); |
| 542 | break; |
| 543 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 544 | } else { |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 545 | switch(key.u.inVecSize) { |
| 546 | case 3: |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 547 | ADD_CHUNK(load_u8f_4); |
| 548 | break; |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 549 | case 2: |
| 550 | ADD_CHUNK(load_u8f_3); |
| 551 | break; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 552 | case 1: |
| 553 | ADD_CHUNK(load_u8f_2); |
| 554 | break; |
| 555 | case 0: |
| 556 | ADD_CHUNK(load_u8f_1); |
| 557 | break; |
| 558 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 559 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 560 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 561 | for (int i=0; i < 4; i++) { |
| 562 | for (int j=0; j < 4; j++) { |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 563 | switch(ops[i][j]) { |
| 564 | case 0: |
| 565 | break; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 566 | case 2: |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 567 | buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); |
| 568 | break; |
| 569 | case 3: |
| 570 | buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 571 | break; |
| 572 | } |
| 573 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 574 | } |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 575 | for (int j=0; j < 4; j++) { |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 576 | if (opInit[j]) { |
| 577 | if (key.u.addMask & (1 << j)) { |
| 578 | buf = addVADD_F32(buf, j, 12+j, 8+j); |
| 579 | } else { |
| 580 | buf = addVORR_32(buf, j, 12+j, 12+j); |
| 581 | } |
| 582 | } else { |
| 583 | if (key.u.addMask & (1 << j)) { |
Simon Hosie | c7c255e | 2014-03-07 16:23:12 -0800 | [diff] [blame] | 584 | buf = addVORR_32(buf, j, 8+j, 8+j); |
| 585 | } else { |
| 586 | buf = addVMOV_32(buf, j, 0); |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 587 | } |
| 588 | } |
| 589 | } |
| 590 | |
| 591 | if (key.u.outType) { |
| 592 | switch(key.u.outVecSize) { |
| 593 | case 3: |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 594 | ADD_CHUNK(store_f32_4); |
| 595 | break; |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 596 | case 2: |
| 597 | ADD_CHUNK(store_f32_3); |
| 598 | break; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 599 | case 1: |
| 600 | ADD_CHUNK(store_f32_2); |
| 601 | break; |
| 602 | case 0: |
| 603 | ADD_CHUNK(store_f32_1); |
| 604 | break; |
| 605 | } |
| 606 | } else { |
| 607 | switch(key.u.outVecSize) { |
| 608 | case 3: |
| 609 | case 2: |
| 610 | ADD_CHUNK(store_f32u_4); |
| 611 | break; |
| 612 | case 1: |
| 613 | ADD_CHUNK(store_f32u_2); |
| 614 | break; |
| 615 | case 0: |
| 616 | ADD_CHUNK(store_f32u_1); |
| 617 | break; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 618 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 619 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 620 | |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 621 | |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 622 | } else { |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 623 | // Add the function prefix |
| 624 | // Store the address for the loop return |
| 625 | ADD_CHUNK(prefix_i); |
| 626 | buf2 = buf; |
| 627 | |
| 628 | // Load the incoming r,g,b,a as needed |
| 629 | switch(key.u.inVecSize) { |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 630 | case 3: |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 631 | ADD_CHUNK(load_u8_4); |
| 632 | if (key.u.copyAlpha) { |
| 633 | ADD_CHUNK(unpack_u8_3); |
| 634 | } else { |
| 635 | ADD_CHUNK(unpack_u8_4); |
| 636 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 637 | break; |
| 638 | case 2: |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 639 | ADD_CHUNK(load_u8_3); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 640 | ADD_CHUNK(unpack_u8_3); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 641 | break; |
| 642 | case 1: |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 643 | ADD_CHUNK(load_u8_2); |
| 644 | ADD_CHUNK(unpack_u8_2); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 645 | break; |
| 646 | case 0: |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 647 | ADD_CHUNK(load_u8_1); |
| 648 | ADD_CHUNK(unpack_u8_1); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 649 | break; |
| 650 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 651 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 652 | // Add multiply and accumulate |
| 653 | // use MULL to init the output register, |
| 654 | // use MLAL from there |
| 655 | for (int i=0; i < 4; i++) { |
| 656 | for (int j=0; j < 4; j++) { |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 657 | switch(ops[i][j]) { |
| 658 | case 0: |
| 659 | break; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 660 | case 2: |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 661 | buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); |
| 662 | break; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 663 | case 3: |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 664 | buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j); |
| 665 | break; |
| 666 | } |
| 667 | } |
| 668 | } |
| 669 | for (int j=0; j < 4; j++) { |
Jason Sams | ec3cd2d | 2013-09-11 18:08:47 -0700 | [diff] [blame] | 670 | if (opInit[j]) { |
| 671 | if (key.u.addMask & (1 << j)) { |
| 672 | buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); |
| 673 | } |
| 674 | } else { |
| 675 | if (key.u.addMask & (1 << j)) { |
Simon Hosie | c7c255e | 2014-03-07 16:23:12 -0800 | [diff] [blame] | 676 | buf = addVORR_32(buf, 8+j, 4+j, 4+j); |
Jason Sams | ec3cd2d | 2013-09-11 18:08:47 -0700 | [diff] [blame] | 677 | } |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 678 | } |
| 679 | } |
| 680 | |
| 681 | // If we have a dot product, perform the special pack. |
| 682 | if (key.u.dot) { |
| 683 | ADD_CHUNK(pack_u8_1); |
| 684 | ADD_CHUNK(dot); |
| 685 | } else { |
| 686 | switch(key.u.outVecSize) { |
| 687 | case 3: |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 688 | if (key.u.copyAlpha) { |
| 689 | ADD_CHUNK(pack_u8_3); |
| 690 | } else { |
| 691 | ADD_CHUNK(pack_u8_4); |
| 692 | } |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 693 | break; |
| 694 | case 2: |
| 695 | ADD_CHUNK(pack_u8_3); |
| 696 | break; |
| 697 | case 1: |
| 698 | ADD_CHUNK(pack_u8_2); |
| 699 | break; |
| 700 | case 0: |
| 701 | ADD_CHUNK(pack_u8_1); |
| 702 | break; |
| 703 | } |
| 704 | } |
| 705 | |
| 706 | // Write out result |
| 707 | switch(key.u.outVecSize) { |
| 708 | case 3: |
| 709 | case 2: |
| 710 | ADD_CHUNK(store_u8_4); |
| 711 | break; |
| 712 | case 1: |
| 713 | ADD_CHUNK(store_u8_2); |
| 714 | break; |
| 715 | case 0: |
| 716 | ADD_CHUNK(store_u8_1); |
| 717 | break; |
| 718 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 719 | } |
| 720 | |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 721 | if (key.u.inType != key.u.outType) { |
| 722 | key.u.copyAlpha = 0; |
| 723 | key.u.dot = 0; |
| 724 | } |
| 725 | |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 726 | // Loop, branch, and cleanup |
| 727 | ADD_CHUNK(postfix1); |
| 728 | buf = addBranch(buf, buf2, 0x01); |
| 729 | ADD_CHUNK(postfix2); |
| 730 | |
| 731 | int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); |
| 732 | if (ret == -1) { |
| 733 | ALOGE("mprotect error %i", ret); |
| 734 | return false; |
| 735 | } |
| 736 | |
Narayan Kamath | 72f5f8c | 2014-03-11 12:23:29 +0000 | [diff] [blame] | 737 | FLUSH_CPU_CACHE(mBuf, (char*) mBuf + mBufSize); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 738 | return true; |
| 739 | #else |
| 740 | return false; |
| 741 | #endif |
| 742 | } |
| 743 | |
Jason Sams | ec3cd2d | 2013-09-11 18:08:47 -0700 | [diff] [blame] | 744 | void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) { |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 745 | for(int ct=0; ct < 16; ct++) { |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 746 | ip[ct] = (short)(fp[ct] * 256.f + 0.5f); |
| 747 | tmpFp[ct] = fp[ct] * fpMul; |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 748 | //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]); |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 749 | } |
| 750 | |
Jason Sams | ec3cd2d | 2013-09-11 18:08:47 -0700 | [diff] [blame] | 751 | float add = 0.f; |
| 752 | if (fpMul > 254.f) add = 0.5f; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 753 | for(int ct=0; ct < 4; ct++) { |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 754 | tmpFpa[ct] = fpa[ct] * addMul + add; |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 755 | //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]); |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 756 | } |
| 757 | |
Jason Sams | ec3cd2d | 2013-09-11 18:08:47 -0700 | [diff] [blame] | 758 | for(int ct=0; ct < 4; ct++) { |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 759 | ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f); |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 760 | } |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 761 | } |
| 762 | |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 763 | void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data, |
| 764 | size_t dataLength) { |
| 765 | switch(slot) { |
| 766 | case 0: |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 767 | memcpy (fp, data, sizeof(fp)); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 768 | break; |
| 769 | case 1: |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 770 | memcpy (fpa, data, sizeof(fpa)); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 771 | break; |
| 772 | default: |
| 773 | rsAssert(0); |
| 774 | break; |
| 775 | } |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 776 | mRootPtr = &kernel; |
| 777 | } |
| 778 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 779 | |
Chris Wailes | 80ef693 | 2014-07-08 11:22:18 -0700 | [diff] [blame] | 780 | static void One(const RsExpandKernelParams *p, void *out, |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 781 | const void *py, const float* coeff, const float *add, |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 782 | uint32_t vsin, uint32_t vsout, bool fin, bool fout) { |
| 783 | |
| 784 | float4 f = 0.f; |
| 785 | if (fin) { |
| 786 | switch(vsin) { |
| 787 | case 3: |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 788 | f = ((const float4 *)py)[0]; |
| 789 | break; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 790 | case 2: |
| 791 | f = ((const float4 *)py)[0]; |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 792 | f.w = 0.f; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 793 | break; |
| 794 | case 1: |
| 795 | f.xy = ((const float2 *)py)[0]; |
| 796 | break; |
| 797 | case 0: |
| 798 | f.x = ((const float *)py)[0]; |
| 799 | break; |
| 800 | } |
| 801 | } else { |
| 802 | switch(vsin) { |
| 803 | case 3: |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 804 | f = convert_float4(((const uchar4 *)py)[0]); |
| 805 | break; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 806 | case 2: |
| 807 | f = convert_float4(((const uchar4 *)py)[0]); |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 808 | f.w = 0.f; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 809 | break; |
| 810 | case 1: |
Jason Sams | 68c8172 | 2013-08-21 16:58:27 -0700 | [diff] [blame] | 811 | f.xy = convert_float2(((const uchar2 *)py)[0]); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 812 | break; |
| 813 | case 0: |
Jason Sams | 68c8172 | 2013-08-21 16:58:27 -0700 | [diff] [blame] | 814 | f.x = (float)(((const uchar *)py)[0]); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 815 | break; |
| 816 | } |
| 817 | } |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 818 | //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 819 | |
| 820 | float4 sum; |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 821 | sum.x = f.x * coeff[0] + |
| 822 | f.y * coeff[4] + |
| 823 | f.z * coeff[8] + |
| 824 | f.w * coeff[12]; |
| 825 | sum.y = f.x * coeff[1] + |
| 826 | f.y * coeff[5] + |
| 827 | f.z * coeff[9] + |
| 828 | f.w * coeff[13]; |
| 829 | sum.z = f.x * coeff[2] + |
| 830 | f.y * coeff[6] + |
| 831 | f.z * coeff[10] + |
| 832 | f.w * coeff[14]; |
| 833 | sum.w = f.x * coeff[3] + |
| 834 | f.y * coeff[7] + |
| 835 | f.z * coeff[11] + |
| 836 | f.w * coeff[15]; |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 837 | //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 838 | |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 839 | sum.x += add[0]; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 840 | sum.y += add[1]; |
| 841 | sum.z += add[2]; |
| 842 | sum.w += add[3]; |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 843 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 844 | |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 845 | //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 846 | if (fout) { |
| 847 | switch(vsout) { |
| 848 | case 3: |
| 849 | case 2: |
| 850 | ((float4 *)out)[0] = sum; |
| 851 | break; |
| 852 | case 1: |
| 853 | ((float2 *)out)[0] = sum.xy; |
| 854 | break; |
| 855 | case 0: |
| 856 | ((float *)out)[0] = sum.x; |
| 857 | break; |
| 858 | } |
| 859 | } else { |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 860 | sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); |
| 861 | sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y); |
| 862 | sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); |
| 863 | sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w); |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 864 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 865 | switch(vsout) { |
| 866 | case 3: |
| 867 | case 2: |
| 868 | ((uchar4 *)out)[0] = convert_uchar4(sum); |
| 869 | break; |
| 870 | case 1: |
| 871 | ((uchar2 *)out)[0] = convert_uchar2(sum.xy); |
| 872 | break; |
| 873 | case 0: |
| 874 | ((uchar *)out)[0] = sum.x; |
| 875 | break; |
| 876 | } |
| 877 | } |
Jason Sams | 2b0d8e6 | 2013-08-29 16:41:01 -0700 | [diff] [blame] | 878 | //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]); |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 879 | } |
| 880 | |
Chris Wailes | 80ef693 | 2014-07-08 11:22:18 -0700 | [diff] [blame] | 881 | void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p, |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 882 | uint32_t xstart, uint32_t xend, |
Chris Wailes | 9ed7910 | 2014-07-25 15:53:28 -0700 | [diff] [blame] | 883 | uint32_t outstep) { |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 884 | RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr; |
Chris Wailes | f371213 | 2014-07-16 15:18:30 -0700 | [diff] [blame] | 885 | |
Chris Wailes | 9ed7910 | 2014-07-25 15:53:28 -0700 | [diff] [blame] | 886 | uint32_t instep = p->inEStrides[0]; |
Chris Wailes | f371213 | 2014-07-16 15:18:30 -0700 | [diff] [blame] | 887 | |
| 888 | uchar *out = (uchar *)p->out + outstep * xstart; |
| 889 | uchar *in = (uchar *)p->ins[0] + instep * xstart; |
| 890 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 891 | uint32_t x1 = xstart; |
| 892 | uint32_t x2 = xend; |
| 893 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 894 | uint32_t vsin = cp->mLastKey.u.inVecSize; |
| 895 | uint32_t vsout = cp->mLastKey.u.outVecSize; |
| 896 | bool floatIn = !!cp->mLastKey.u.inType; |
| 897 | bool floatOut = !!cp->mLastKey.u.outType; |
| 898 | |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 899 | //if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); |
| 900 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 901 | if(x2 > x1) { |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 902 | int32_t len = x2 - x1; |
| 903 | if (gArchUseSIMD) { |
| 904 | if((cp->mOptKernel != NULL) && (len >= 4)) { |
Jason Sams | 858d035 | 2014-04-29 18:10:50 -0700 | [diff] [blame] | 905 | // The optimized kernel processes 4 pixels at once |
| 906 | // and requires a minimum of 1 chunk of 4 |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 907 | cp->mOptKernel(out, in, cp->ip, len >> 2); |
Jason Sams | 858d035 | 2014-04-29 18:10:50 -0700 | [diff] [blame] | 908 | // Update the len and pointers so the generic code can |
| 909 | // finish any leftover pixels |
Jason Sams | 98dd4bb | 2014-04-29 15:30:30 -0700 | [diff] [blame] | 910 | len &= ~3; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 911 | x1 += len; |
| 912 | out += outstep * len; |
| 913 | in += instep * len; |
| 914 | } |
Tim Murray | 6a45ddb | 2014-08-06 11:49:02 -0700 | [diff] [blame] | 915 | #if 0 && defined(ARCH_ARM64_USE_INTRINSICS) |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 916 | else { |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 917 | if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) { |
Simon Hosie | 6e7e258 | 2014-05-06 01:07:21 -0700 | [diff] [blame] | 918 | rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 919 | } else { |
Simon Hosie | 6e7e258 | 2014-05-06 01:07:21 -0700 | [diff] [blame] | 920 | rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 921 | } |
Simon Hosie | 6e7e258 | 2014-05-06 01:07:21 -0700 | [diff] [blame] | 922 | x1 += len; |
| 923 | out += outstep * len; |
| 924 | in += instep * len; |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 925 | } |
| 926 | #endif |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 927 | } |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 928 | |
| 929 | while(x1 != x2) { |
Jason Sams | ec3cd2d | 2013-09-11 18:08:47 -0700 | [diff] [blame] | 930 | One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut); |
Jason Sams | 68c8172 | 2013-08-21 16:58:27 -0700 | [diff] [blame] | 931 | out += outstep; |
| 932 | in += instep; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 933 | x1++; |
| 934 | } |
| 935 | } |
| 936 | } |
| 937 | |
Chris Wailes | f371213 | 2014-07-16 15:18:30 -0700 | [diff] [blame] | 938 | void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, |
| 939 | const Allocation ** ains, |
| 940 | uint32_t inLen, |
| 941 | Allocation * aout, |
| 942 | const void * usr, |
| 943 | uint32_t usrLen, |
| 944 | const RsScriptCall *sc) { |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 945 | |
Chris Wailes | f371213 | 2014-07-16 15:18:30 -0700 | [diff] [blame] | 946 | const Element *ein = ains[0]->mHal.state.type->getElement(); |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 947 | const Element *eout = aout->mHal.state.type->getElement(); |
| 948 | |
| 949 | if (ein->getType() == eout->getType()) { |
Jason Sams | ec3cd2d | 2013-09-11 18:08:47 -0700 | [diff] [blame] | 950 | if (eout->getType() == RS_TYPE_UNSIGNED_8) { |
| 951 | updateCoeffCache(1.f, 255.f); |
| 952 | } else { |
| 953 | updateCoeffCache(1.f, 1.f); |
| 954 | } |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 955 | } else { |
| 956 | if (eout->getType() == RS_TYPE_UNSIGNED_8) { |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 957 | updateCoeffCache(255.f, 255.f); |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 958 | } else { |
Jason Sams | 9e4a96a | 2013-09-11 15:52:22 -0700 | [diff] [blame] | 959 | updateCoeffCache(1.f / 255.f, 1.f); |
Jason Sams | 17e3cdc | 2013-09-09 17:32:16 -0700 | [diff] [blame] | 960 | } |
| 961 | } |
| 962 | |
Chris Wailes | f371213 | 2014-07-16 15:18:30 -0700 | [diff] [blame] | 963 | Key_t key = computeKey(ein, eout); |
| 964 | |
Rose, James | 7b7060c | 2014-04-22 12:08:06 +0800 | [diff] [blame] | 965 | #if defined(ARCH_X86_HAVE_SSSE3) |
| 966 | if ((mOptKernel == NULL) || (mLastKey.key != key.key)) { |
| 967 | // FIXME: Disable mOptKernel to pass RS color matrix CTS cases |
| 968 | // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key); |
| 969 | mLastKey = key; |
| 970 | } |
| 971 | |
| 972 | #else //if !defined(ARCH_X86_HAVE_SSSE3) |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 973 | if ((mOptKernel == NULL) || (mLastKey.key != key.key)) { |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 974 | if (mBuf) munmap(mBuf, mBufSize); |
| 975 | mBuf = NULL; |
| 976 | mOptKernel = NULL; |
| 977 | if (build(key)) { |
| 978 | mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 979 | } |
Tim Murray | 6a45ddb | 2014-08-06 11:49:02 -0700 | [diff] [blame] | 980 | #if 0 && defined(ARCH_ARM64_USE_INTRINSICS) |
Simon Hosie | 0462a39 | 2014-03-07 19:36:44 -0800 | [diff] [blame] | 981 | else { |
| 982 | int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); |
| 983 | int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); |
| 984 | uint32_t mm = 0; |
| 985 | int i; |
| 986 | for (i = 0; i < 4; i++) |
| 987 | { |
| 988 | uint32_t m = (key.u.coeffMask >> i) & 0x1111; |
| 989 | m = ((m * 0x249) >> 9) & 15; |
| 990 | m |= ((key.u.addMask >> i) & 1) << 4; |
| 991 | mm |= m << (i * 5); |
| 992 | } |
| 993 | |
| 994 | if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { |
| 995 | rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st); |
| 996 | } else { |
| 997 | rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st); |
| 998 | } |
| 999 | } |
| 1000 | #endif |
| 1001 | mLastKey = key; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1002 | } |
Rose, James | 7b7060c | 2014-04-22 12:08:06 +0800 | [diff] [blame] | 1003 | #endif //if !defined(ARCH_X86_HAVE_SSSE3) |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1004 | } |
| 1005 | |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1006 | RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( |
Jason Sams | c905efd | 2012-11-26 15:20:18 -0800 | [diff] [blame] | 1007 | RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) |
| 1008 | : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1009 | |
Jason Sams | a65de10 | 2013-08-09 13:42:28 -0700 | [diff] [blame] | 1010 | mLastKey.key = 0; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 1011 | mBuf = NULL; |
| 1012 | mBufSize = 0; |
| 1013 | mOptKernel = NULL; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1014 | const static float defaultMatrix[] = { |
| 1015 | 1.f, 0.f, 0.f, 0.f, |
| 1016 | 0.f, 1.f, 0.f, 0.f, |
| 1017 | 0.f, 0.f, 1.f, 0.f, |
| 1018 | 0.f, 0.f, 0.f, 1.f |
| 1019 | }; |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 1020 | const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1021 | setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix)); |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 1022 | setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1023 | } |
| 1024 | |
| 1025 | RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() { |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 1026 | if (mBuf) munmap(mBuf, mBufSize); |
| 1027 | mBuf = NULL; |
| 1028 | mOptKernel = NULL; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1029 | } |
| 1030 | |
| 1031 | void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) { |
Jason Sams | 9b2b9ef | 2013-07-29 17:38:00 -0700 | [diff] [blame] | 1032 | s->mHal.info.exportedVariableCount = 2; |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1033 | } |
| 1034 | |
Jason Sams | c905efd | 2012-11-26 15:20:18 -0800 | [diff] [blame] | 1035 | RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, |
| 1036 | const Script *s, const Element *e) { |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1037 | |
Jason Sams | c905efd | 2012-11-26 15:20:18 -0800 | [diff] [blame] | 1038 | return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e); |
Jason Sams | 709a097 | 2012-11-15 18:18:04 -0800 | [diff] [blame] | 1039 | } |