First draft of allocation sampling.

Change-Id: I82efe119bb05c52d8e9a1e2146dce5eb4b9f79a8
diff --git a/lib/ScriptCRT/rs_core.c b/lib/ScriptCRT/rs_core.c
index 0713149..ab38ec3 100644
--- a/lib/ScriptCRT/rs_core.c
+++ b/lib/ScriptCRT/rs_core.c
@@ -839,3 +839,278 @@
     }
     return element->mHal.state.vectorSize;
 }
+
+/**
+* Allocation sampling
+*/
+static const void * __attribute__((overloadable))
+        getElementAt(rs_allocation a, uint32_t x, uint32_t lod) {
+    Allocation_t *alloc = (Allocation_t *)a.p;
+    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
+    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.mallocPtr;
+
+    const uint32_t offset = type->mHal.state.lodOffset[lod];
+    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
+
+    return &p[offset + eSize * x];
+}
+
+static const void * __attribute__((overloadable))
+        getElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t lod) {
+    Allocation_t *alloc = (Allocation_t *)a.p;
+    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
+    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.mallocPtr;
+
+    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
+    const uint32_t offset = type->mHal.state.lodOffset[lod];
+    const uint32_t lodDimX = type->mHal.state.lodDimX[lod];
+
+    return &p[offset + eSize * (x + y * lodDimX)];
+}
+
+static int32_t wrapI(rs_sampler_value wrap, int32_t coord, int32_t size) {
+    if (wrap == RS_SAMPLER_WRAP) {
+        coord = coord % size;
+        if (coord < 0) {
+            coord += size;
+        }
+    }
+    return max(0, min(coord, size - 1));
+}
+
+#define convert_float(v) (float)v
+#define SAMPLE_1D_FUNC(vecsize)                                                                 \
+        static float##vecsize get1DSample##vecsize(rs_allocation a, float2 weights,             \
+                                                   int iPixel, int next, uint32_t lod) {        \
+            uchar##vecsize *p0c = (uchar##vecsize*)getElementAt(a, iPixel, lod);                \
+            uchar##vecsize *p1c = (uchar##vecsize*)getElementAt(a, next, lod);                  \
+            float##vecsize p0 = convert_float##vecsize(*p0c);                                   \
+            float##vecsize p1 = convert_float##vecsize(*p1c);                                   \
+            return p0 * weights.x + p1 * weights.y;                                             \
+        }
+#define SAMPLE_2D_FUNC(vecsize)                                                                 \
+        static float##vecsize get2DSample##vecsize(rs_allocation a, float4 weights,             \
+                                                   int2 iPixel, int nextX, int nextY,           \
+                                                   uint32_t lod) {                              \
+            uchar##vecsize *p0c = (uchar##vecsize*)getElementAt(a, iPixel.x, iPixel.y, lod);    \
+            uchar##vecsize *p1c = (uchar##vecsize*)getElementAt(a, nextX, iPixel.y, lod);       \
+            uchar##vecsize *p2c = (uchar##vecsize*)getElementAt(a, iPixel.x, nextY, lod);       \
+            uchar##vecsize *p3c = (uchar##vecsize*)getElementAt(a, nextX, nextY, lod);          \
+            float##vecsize p0 = convert_float##vecsize(*p0c);                                   \
+            float##vecsize p1 = convert_float##vecsize(*p1c);                                   \
+            float##vecsize p2 = convert_float##vecsize(*p2c);                                   \
+            float##vecsize p3 = convert_float##vecsize(*p3c);                                   \
+            return p0 * weights.x + p1 * weights.y + p2 * weights.z + p3 * weights.w;           \
+        }
+
+SAMPLE_1D_FUNC()
+SAMPLE_1D_FUNC(2)
+SAMPLE_1D_FUNC(3)
+SAMPLE_1D_FUNC(4)
+
+SAMPLE_2D_FUNC()
+SAMPLE_2D_FUNC(2)
+SAMPLE_2D_FUNC(3)
+SAMPLE_2D_FUNC(4)
+
+// TODO: implement 565
+static float4 getBilinearSample565(rs_allocation a, float4 weights,
+                                   int2 iPixel, int nextX, int nextY, uint32_t lod) {
+    float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};
+    return zero;
+}
+
+static float4 getBilinearSample(rs_allocation a, float4 weights,
+                                int2 iPixel, int nextX, int nextY,
+                                uint32_t vecSize, rs_data_type dt, uint32_t lod) {
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+        return getBilinearSample565(a, weights, iPixel, nextX, nextY, lod);
+    }
+
+    float4 result;
+    switch(vecSize) {
+    case 1:
+        result.x = get2DSample(a, weights, iPixel, nextX, nextY, lod);
+        break;
+    case 2:
+        result.xy = get2DSample2(a, weights, iPixel, nextX, nextY, lod);
+        break;
+    case 3:
+        result.xyz = get2DSample3(a, weights, iPixel, nextX, nextY, lod);
+        break;
+    case 4:
+        result = get2DSample4(a, weights, iPixel, nextX, nextY, lod);
+        break;
+    }
+
+    return result;
+}
+
+static float4 getNearestSample(rs_allocation a, int2 iPixel, uint32_t vecSize,
+                               rs_data_type dt, uint32_t lod) {
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
+        float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};
+        return zero;
+    }
+
+    float4 result;
+    switch(vecSize) {
+    case 1:
+        result.x = convert_float(*((uchar*)getElementAt(a, iPixel.x, iPixel.y, lod)));
+        break;
+    case 2:
+        result.xy = convert_float2(*((uchar2*)getElementAt(a, iPixel.x, iPixel.y, lod)));
+        break;
+    case 3:
+        result.xyz = convert_float3(*((uchar3*)getElementAt(a, iPixel.x, iPixel.y, lod)));
+        break;
+    case 4:
+        result = convert_float4(*((uchar4*)getElementAt(a, iPixel.x, iPixel.y, lod)));
+        break;
+    }
+
+    return result;
+}
+
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float location) {
+    return rsSample(a, s, location, 0);
+}
+
+//TODO: implement 1D sampling
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float location, float lod) {
+    float4 result;
+    return result;
+}
+
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float2 location) {
+    return rsSample(a, s, location, 0.0f);
+}
+
+static float4 sample_LOD_LinearPixel(rs_allocation a, const Type_t *type,
+                                     uint32_t vecSize, rs_data_type dt,
+                                     rs_sampler s,
+                                     float2 uv, uint32_t lod) {
+    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
+    rs_sampler_value wrapT = rsgSamplerGetWrapT(s);
+
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    int32_t sourceH = type->mHal.state.lodDimY[lod];
+
+    float2 dimF;
+    dimF.x = (float)(sourceW);
+    dimF.y = (float)(sourceH);
+    float2 pixelUV = uv * dimF;
+    int2 iPixel = convert_int2(pixelUV);
+
+    float2 frac = pixelUV - convert_float2(iPixel);
+
+    if (frac.x < 0.5f) {
+        iPixel.x -= 1;
+        frac.x += 0.5f;
+    } else {
+        frac.x -= 0.5f;
+    }
+    if (frac.y < 0.5f) {
+        iPixel.y -= 1;
+        frac.y += 0.5f;
+    } else {
+        frac.y -= 0.5f;
+    }
+    float2 oneMinusFrac = 1.0f - frac;
+
+    float4 weights;
+    weights.x = oneMinusFrac.x * oneMinusFrac.y;
+    weights.y = frac.x * oneMinusFrac.y;
+    weights.z = oneMinusFrac.x * frac.y;
+    weights.w = frac.x * frac.y;
+
+    int32_t nextX = wrapI(wrapS, iPixel.x + 1, sourceW);
+    int32_t nextY = wrapI(wrapT, iPixel.y + 1, sourceH);
+    iPixel.x = wrapI(wrapS, iPixel.x, sourceW);
+    iPixel.y = wrapI(wrapT, iPixel.y, sourceH);
+
+    return getBilinearSample(a, weights, iPixel, nextX, nextY, vecSize, dt, lod);
+}
+
+static float4 sample_LOD_NearestPixel(rs_allocation a, const Type_t *type,
+                                      uint32_t vecSize, rs_data_type dt,
+                                      rs_sampler s,
+                                      float2 uv, uint32_t lod) {
+    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
+    rs_sampler_value wrapT = rsgSamplerGetWrapT(s);
+
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    int32_t sourceH = type->mHal.state.lodDimY[lod];
+
+    float2 dimF;
+    dimF.x = (float)(sourceW);
+    dimF.y = (float)(sourceH);
+    int2 iPixel = convert_int2(uv * dimF);
+
+    iPixel.x = wrapI(wrapS, iPixel.x, sourceW);
+    iPixel.y = wrapI(wrapT, iPixel.y, sourceH);
+    return getNearestSample(a, iPixel, vecSize, dt, lod);
+}
+
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float2 uv, float lod) {
+    // Find out what kind of input data we are sampling
+    rs_element elem = rsAllocationGetElement(a);
+    rs_data_kind dk = rsElementGetDataKind(elem);
+    rs_data_type dt = rsElementGetDataType(elem);
+
+    if (dk == RS_KIND_USER || (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5)) {
+        float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};
+        return zero;
+    }
+
+    uint32_t vecSize = rsElementGetVectorSize(elem);
+    Allocation_t *alloc = (Allocation_t *)a.p;
+    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
+
+    rs_sampler_value sampleMin = rsgSamplerGetMinification(s);
+    rs_sampler_value sampleMag = rsgSamplerGetMagnification(s);
+
+    if (sampleMin == RS_SAMPLER_NEAREST &&
+        sampleMag == RS_SAMPLER_NEAREST) {
+        return sample_LOD_NearestPixel(a, type, vecSize, dt, s, uv, 0);
+    }
+
+    if (sampleMin == RS_SAMPLER_LINEAR_MIP_NEAREST) {
+        // clamp the lod to between zero and the highest available
+        lod = clamp(lod, 0.0f, (float)(type->mHal.state.lodCount - 1));
+        uint32_t nearestLOD = (uint32_t)round(lod);
+        return sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, nearestLOD);
+    }
+
+    if (sampleMin == RS_SAMPLER_LINEAR_MIP_LINEAR) {
+        // clamp the lod to between zero and the highest available
+        lod = clamp(lod, 0.0f, (float)(type->mHal.state.lodCount - 1));
+        uint32_t lod0 = (uint32_t)floor(lod);
+        uint32_t lod1 = (uint32_t)ceil(lod);
+        float4 sample0 = sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, lod0);
+        float4 sample1 = sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, lod1);
+        float frac = lod - (float)lod0;
+        return sample0 * (1.0f - frac) + sample1 * frac;
+    }
+
+    return sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, 0);
+}
+
+// TODO: implement cubemap lookups
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float3 location) {
+    return rsSample(a, s, location, 0.0f);
+}
+
+// TODO: implement cubemap lookups
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float3 location, float lod) {
+    float4 result;
+    return result;
+}
+
+#undef convert_float