Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2012 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | |
| 18 | #include "rsCpuIntrinsic.h" |
| 19 | #include "rsCpuIntrinsicInlines.h" |
| 20 | |
| 21 | using namespace android; |
| 22 | using namespace android::renderscript; |
| 23 | |
| 24 | namespace android { |
| 25 | namespace renderscript { |
| 26 | |
| 27 | |
| 28 | class RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic { |
| 29 | public: |
| 30 | virtual void populateScript(Script *); |
| 31 | virtual void invokeFreeChildren(); |
| 32 | |
| 33 | virtual void setGlobalObj(uint32_t slot, ObjectBase *data); |
| 34 | |
| 35 | virtual ~RsdCpuScriptIntrinsic3DLUT(); |
| 36 | RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); |
| 37 | |
| 38 | protected: |
| 39 | ObjectBaseRef<Allocation> mLUT; |
| 40 | |
| 41 | static void kernel(const RsForEachStubParamStruct *p, |
| 42 | uint32_t xstart, uint32_t xend, |
| 43 | uint32_t instep, uint32_t outstep); |
| 44 | }; |
| 45 | |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | |
| 50 | void RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) { |
| 51 | rsAssert(slot == 0); |
| 52 | mLUT.set(static_cast<Allocation *>(data)); |
| 53 | } |
| 54 | |
// Externally implemented (C linkage) bulk 3DLUT routine used by the SIMD path
// of RsdCpuScriptIntrinsic3DLUT::kernel().
//   dst / src      output and input pixel buffers
//   lut            base address of the LUT data
//   lut_stride_y   byte offset between consecutive LUT rows
//   lut_stride_z   byte offset between consecutive LUT planes
//   count          iteration count; the caller advances its pointers by
//                  2 * count pixels, so presumably pixels are handled in pairs
//   constants      table of 16-bit constants prepared by the caller
extern "C" void rsdIntrinsic3DLUT_K(void *dst,
                                    const void *src,
                                    const void *lut,
                                    size_t lut_stride_y,
                                    size_t lut_stride_z,
                                    uint32_t count,
                                    const void *constants);
| 58 | |
| 59 | |
| 60 | void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p, |
| 61 | uint32_t xstart, uint32_t xend, |
| 62 | uint32_t instep, uint32_t outstep) { |
| 63 | RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr; |
| 64 | |
| 65 | uchar4 *out = (uchar4 *)p->out; |
| 66 | uchar4 *in = (uchar4 *)p->in; |
| 67 | uint32_t x1 = xstart; |
| 68 | uint32_t x2 = xend; |
| 69 | |
| 70 | const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr; |
| 71 | |
| 72 | int4 dims = { |
Stephen Hines | d533c4c | 2013-03-06 02:55:32 -0800 | [diff] [blame] | 73 | cp->mLUT->mHal.drvState.lod[0].dimX - 1, |
| 74 | cp->mLUT->mHal.drvState.lod[0].dimY - 1, |
| 75 | cp->mLUT->mHal.drvState.lod[0].dimZ - 1, |
| 76 | -1 |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 77 | }; |
Stephen Hines | d533c4c | 2013-03-06 02:55:32 -0800 | [diff] [blame] | 78 | const float4 m = (float4)(1.f / 255.f) * convert_float4(dims); |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 79 | const int4 coordMul = convert_int4(m * (float4)0x8000); |
| 80 | const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride; |
| 81 | const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY; |
| 82 | |
| 83 | //ALOGE("strides %zu %zu", stride_y, stride_z); |
| 84 | |
| 85 | while (x1 < x2) { |
Jason Sams | f5ef8df | 2013-08-06 13:49:25 -0700 | [diff] [blame] | 86 | #if defined(ARCH_ARM_HAVE_VFP) |
| 87 | if (gArchUseSIMD) { |
| 88 | int32_t len = (x2 - x1 - 1) >> 1; |
| 89 | if(len > 0) { |
| 90 | const short neon_constants[] = { |
| 91 | coordMul.x, coordMul.y, coordMul.z, 0, |
| 92 | 0, 0, 0, 0xffff, |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 93 | |
Jason Sams | f5ef8df | 2013-08-06 13:49:25 -0700 | [diff] [blame] | 94 | }; |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 95 | |
Jason Sams | f5ef8df | 2013-08-06 13:49:25 -0700 | [diff] [blame] | 96 | rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants); |
| 97 | x1 += len << 1; |
| 98 | out += len << 1; |
| 99 | in += len << 1; |
| 100 | } |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 101 | } |
| 102 | |
| 103 | #endif |
| 104 | |
| 105 | int4 baseCoord = convert_int4(*in) * coordMul; |
| 106 | int4 coord1 = baseCoord >> (int4)15; |
| 107 | //int4 coord2 = min(coord1 + 1, gDims - 1); |
| 108 | |
| 109 | int4 weight2 = baseCoord & 0x7fff; |
| 110 | int4 weight1 = (int4)0x8000 - weight2; |
| 111 | |
| 112 | //ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); |
| 113 | const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z); |
| 114 | const uchar4 *pt_00 = (const uchar4 *)&bp2[0]; |
| 115 | const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y]; |
| 116 | const uchar4 *pt_01 = (const uchar4 *)&bp2[stride_z]; |
| 117 | const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z]; |
| 118 | |
| 119 | uint4 v000 = convert_uint4(pt_00[0]); |
| 120 | uint4 v100 = convert_uint4(pt_00[1]); |
| 121 | uint4 v010 = convert_uint4(pt_10[0]); |
| 122 | uint4 v110 = convert_uint4(pt_10[1]); |
| 123 | uint4 v001 = convert_uint4(pt_01[0]); |
| 124 | uint4 v101 = convert_uint4(pt_01[1]); |
| 125 | uint4 v011 = convert_uint4(pt_11[0]); |
| 126 | uint4 v111 = convert_uint4(pt_11[1]); |
| 127 | |
| 128 | uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7; |
| 129 | uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7; |
| 130 | uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7; |
| 131 | uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7; |
| 132 | |
Stephen Hines | 5e3fb0b | 2013-01-10 01:45:46 -0800 | [diff] [blame] | 133 | uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15; |
| 134 | uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15; |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 135 | |
Stephen Hines | 5e3fb0b | 2013-01-10 01:45:46 -0800 | [diff] [blame] | 136 | uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15; |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 137 | uint4 v2 = (v + 0x7f) >> (int4)8; |
| 138 | |
| 139 | uchar4 ret = convert_uchar4(v2); |
Tim Murray | 0b575de | 2013-03-15 15:56:43 -0700 | [diff] [blame] | 140 | ret.w = in->w; |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 141 | |
| 142 | #if 0 |
| 143 | if (!x1) { |
| 144 | ALOGE("in %08x %08x %08x %08x", in->r, in->g, in->b, in->a); |
| 145 | ALOGE("baseCoord %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, baseCoord.w); |
| 146 | ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); |
| 147 | ALOGE("weight1 %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w); |
| 148 | ALOGE("weight2 %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w); |
| 149 | |
| 150 | ALOGE("v000 %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w); |
| 151 | ALOGE("v100 %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w); |
| 152 | ALOGE("yz00 %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w); |
| 153 | ALOGE("z0 %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w); |
| 154 | |
| 155 | ALOGE("v %08x %08x %08x %08x", v.x, v.y, v.z, v.w); |
| 156 | ALOGE("v2 %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w); |
| 157 | } |
| 158 | #endif |
| 159 | *out = ret; |
| 160 | |
| 161 | |
| 162 | in++; |
| 163 | out++; |
| 164 | x1++; |
| 165 | } |
Jason Sams | 7c4b888 | 2013-01-04 10:50:05 -0800 | [diff] [blame] | 166 | } |
| 167 | |
| 168 | RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, |
| 169 | const Script *s, const Element *e) |
| 170 | : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) { |
| 171 | |
| 172 | mRootPtr = &kernel; |
| 173 | } |
| 174 | |
| 175 | RsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() { |
| 176 | } |
| 177 | |
| 178 | void RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) { |
| 179 | s->mHal.info.exportedVariableCount = 1; |
| 180 | } |
| 181 | |
| 182 | void RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() { |
| 183 | mLUT.clear(); |
| 184 | } |
| 185 | |
| 186 | |
| 187 | RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx, |
| 188 | const Script *s, const Element *e) { |
| 189 | |
| 190 | return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e); |
| 191 | } |
| 192 | |
| 193 | |