Merge "Adding 1D sampling to the library."
diff --git a/lib/ScriptCRT/Android.mk b/lib/ScriptCRT/Android.mk
index 8b223f0..21e2efc 100644
--- a/lib/ScriptCRT/Android.mk
+++ b/lib/ScriptCRT/Android.mk
@@ -24,7 +24,8 @@
 # C source files for the library
 clcore_c_files := \
     rs_cl.c \
-    rs_core.c
+    rs_core.c \
+    rs_sample.c
 
 # Hand-written bitcode for the library
 clcore_ll_files := \
diff --git a/lib/ScriptCRT/rs_core.c b/lib/ScriptCRT/rs_core.c
index ab38ec3..44dcb09 100644
--- a/lib/ScriptCRT/rs_core.c
+++ b/lib/ScriptCRT/rs_core.c
@@ -1,254 +1,6 @@
 #include "rs_core.rsh"
 #include "rs_graphics.rsh"
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Allocation owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsAllocation.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsAllocationGetDimX(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * allocations.
- *
- *****************************************************************************/
-typedef enum {
-    RS_ALLOCATION_MIPMAP_NONE = 0,
-    RS_ALLOCATION_MIPMAP_FULL = 1,
-    RS_ALLOCATION_MIPMAP_ON_SYNC_TO_TEXTURE = 2
-} rs_allocation_mipmap_control;
-
-typedef struct Allocation {
-    char __pad[28];
-    struct {
-        void * drv;
-        struct {
-            const void *type;
-            uint32_t usageFlags;
-            rs_allocation_mipmap_control mipmapControl;
-            uint32_t dimensionX;
-            uint32_t dimensionY;
-            uint32_t dimensionZ;
-            uint32_t elementSizeBytes;
-            bool hasMipmaps;
-            bool hasFaces;
-            bool hasReferences;
-            void * usrPtr;
-            int32_t surfaceTextureID;
-            void * wndSurface;
-        } state;
-
-        struct DrvState {
-            void * mallocPtr;
-        } drvState;
-    } mHal;
-} Allocation_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class ProgramStore owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsProgramStore.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsgProgramStoreGetDepthFunc(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * program store.
- *
- *****************************************************************************/
-typedef struct ProgramStore {
-    char __pad[36];
-    struct {
-        struct {
-            bool ditherEnable;
-            bool colorRWriteEnable;
-            bool colorGWriteEnable;
-            bool colorBWriteEnable;
-            bool colorAWriteEnable;
-            rs_blend_src_func blendSrc;
-            rs_blend_dst_func blendDst;
-            bool depthWriteEnable;
-            rs_depth_func depthFunc;
-        } state;
-    } mHal;
-} ProgramStore_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class ProgramRaster owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsProgramRaster.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsgProgramRasterGetCullMode(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * program raster.
- *
- *****************************************************************************/
-typedef struct ProgramRaster {
-    char __pad[36];
-    struct {
-        struct {
-            bool pointSprite;
-            rs_cull_mode cull;
-        } state;
-    } mHal;
-} ProgramRaster_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Sampler owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsSampler.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsgProgramRasterGetMagFilter(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * samplers.
- *
- *****************************************************************************/
-typedef struct Sampler {
-    char __pad[32];
-    struct {
-        struct {
-            rs_sampler_value magFilter;
-            rs_sampler_value minFilter;
-            rs_sampler_value wrapS;
-            rs_sampler_value wrapT;
-            rs_sampler_value wrapR;
-            float aniso;
-        } state;
-    } mHal;
-} Sampler_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Element owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsElement.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsElementGetSubElementCount(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * elements.
- *
- *****************************************************************************/
-typedef struct Element {
-    char __pad[28];
-    struct {
-        void *drv;
-        struct {
-            rs_data_type dataType;
-            rs_data_kind dataKind;
-            uint32_t vectorSize;
-            uint32_t elementSizeBytes;
-
-            // Subelements
-            const void **fields;
-            uint32_t *fieldArraySizes;
-            const char **fieldNames;
-            uint32_t *fieldNameLengths;
-            uint32_t *fieldOffsetBytes;
-            uint32_t fieldsCount;
-        } state;
-    } mHal;
-} Element_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Type owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsType.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsAllocationGetElement(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * types.
- *
- *****************************************************************************/
-typedef struct Type {
-    char __pad[28];
-    struct {
-        void *drv;
-        struct {
-            const void * element;
-            uint32_t dimX;
-            uint32_t dimY;
-            uint32_t dimZ;
-            uint32_t *lodDimX;
-            uint32_t *lodDimY;
-            uint32_t *lodDimZ;
-            uint32_t *lodOffset;
-            uint32_t lodCount;
-            bool faces;
-        } state;
-    } mHal;
-} Type_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Mesh owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsMesh.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsMeshGetVertexAllocationCount(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * meshes.
- *
- *****************************************************************************/
-typedef struct Mesh {
-    char __pad[28];
-    struct {
-        void *drv;
-        struct {
-            void **vertexBuffers;
-            uint32_t vertexBuffersCount;
-
-            // indexBuffers[i] could be NULL, in which case only primitives[i] is used
-            void **indexBuffers;
-            uint32_t indexBuffersCount;
-            rs_primitive *primitives;
-            uint32_t primitivesCount;
-        } state;
-    } mHal;
-} Mesh_t;
-
+#include "rs_core.h"
 
 /* Declaration of 4 basic functions in libRS */
 extern void __attribute__((overloadable))
@@ -839,278 +591,3 @@
     }
     return element->mHal.state.vectorSize;
 }
-
-/**
-* Allocation sampling
-*/
-static const void * __attribute__((overloadable))
-        getElementAt(rs_allocation a, uint32_t x, uint32_t lod) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.mallocPtr;
-
-    const uint32_t offset = type->mHal.state.lodOffset[lod];
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-
-    return &p[offset + eSize * x];
-}
-
-static const void * __attribute__((overloadable))
-        getElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t lod) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.mallocPtr;
-
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    const uint32_t offset = type->mHal.state.lodOffset[lod];
-    const uint32_t lodDimX = type->mHal.state.lodDimX[lod];
-
-    return &p[offset + eSize * (x + y * lodDimX)];
-}
-
-static int32_t wrapI(rs_sampler_value wrap, int32_t coord, int32_t size) {
-    if (wrap == RS_SAMPLER_WRAP) {
-        coord = coord % size;
-        if (coord < 0) {
-            coord += size;
-        }
-    }
-    return max(0, min(coord, size - 1));
-}
-
-#define convert_float(v) (float)v
-#define SAMPLE_1D_FUNC(vecsize)                                                                 \
-        static float##vecsize get1DSample##vecsize(rs_allocation a, float2 weights,             \
-                                                   int iPixel, int next, uint32_t lod) {        \
-            uchar##vecsize *p0c = (uchar##vecsize*)getElementAt(a, iPixel, lod);                \
-            uchar##vecsize *p1c = (uchar##vecsize*)getElementAt(a, next, lod);                  \
-            float##vecsize p0 = convert_float##vecsize(*p0c);                                   \
-            float##vecsize p1 = convert_float##vecsize(*p1c);                                   \
-            return p0 * weights.x + p1 * weights.y;                                             \
-        }
-#define SAMPLE_2D_FUNC(vecsize)                                                                 \
-        static float##vecsize get2DSample##vecsize(rs_allocation a, float4 weights,             \
-                                                   int2 iPixel, int nextX, int nextY,           \
-                                                   uint32_t lod) {                              \
-            uchar##vecsize *p0c = (uchar##vecsize*)getElementAt(a, iPixel.x, iPixel.y, lod);    \
-            uchar##vecsize *p1c = (uchar##vecsize*)getElementAt(a, nextX, iPixel.y, lod);       \
-            uchar##vecsize *p2c = (uchar##vecsize*)getElementAt(a, iPixel.x, nextY, lod);       \
-            uchar##vecsize *p3c = (uchar##vecsize*)getElementAt(a, nextX, nextY, lod);          \
-            float##vecsize p0 = convert_float##vecsize(*p0c);                                   \
-            float##vecsize p1 = convert_float##vecsize(*p1c);                                   \
-            float##vecsize p2 = convert_float##vecsize(*p2c);                                   \
-            float##vecsize p3 = convert_float##vecsize(*p3c);                                   \
-            return p0 * weights.x + p1 * weights.y + p2 * weights.z + p3 * weights.w;           \
-        }
-
-SAMPLE_1D_FUNC()
-SAMPLE_1D_FUNC(2)
-SAMPLE_1D_FUNC(3)
-SAMPLE_1D_FUNC(4)
-
-SAMPLE_2D_FUNC()
-SAMPLE_2D_FUNC(2)
-SAMPLE_2D_FUNC(3)
-SAMPLE_2D_FUNC(4)
-
-// TODO: implement 565
-static float4 getBilinearSample565(rs_allocation a, float4 weights,
-                                   int2 iPixel, int nextX, int nextY, uint32_t lod) {
-    float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};
-    return zero;
-}
-
-static float4 getBilinearSample(rs_allocation a, float4 weights,
-                                int2 iPixel, int nextX, int nextY,
-                                uint32_t vecSize, rs_data_type dt, uint32_t lod) {
-    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
-        return getBilinearSample565(a, weights, iPixel, nextX, nextY, lod);
-    }
-
-    float4 result;
-    switch(vecSize) {
-    case 1:
-        result.x = get2DSample(a, weights, iPixel, nextX, nextY, lod);
-        break;
-    case 2:
-        result.xy = get2DSample2(a, weights, iPixel, nextX, nextY, lod);
-        break;
-    case 3:
-        result.xyz = get2DSample3(a, weights, iPixel, nextX, nextY, lod);
-        break;
-    case 4:
-        result = get2DSample4(a, weights, iPixel, nextX, nextY, lod);
-        break;
-    }
-
-    return result;
-}
-
-static float4 getNearestSample(rs_allocation a, int2 iPixel, uint32_t vecSize,
-                               rs_data_type dt, uint32_t lod) {
-    if (dt == RS_TYPE_UNSIGNED_5_6_5) {
-        float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};
-        return zero;
-    }
-
-    float4 result;
-    switch(vecSize) {
-    case 1:
-        result.x = convert_float(*((uchar*)getElementAt(a, iPixel.x, iPixel.y, lod)));
-        break;
-    case 2:
-        result.xy = convert_float2(*((uchar2*)getElementAt(a, iPixel.x, iPixel.y, lod)));
-        break;
-    case 3:
-        result.xyz = convert_float3(*((uchar3*)getElementAt(a, iPixel.x, iPixel.y, lod)));
-        break;
-    case 4:
-        result = convert_float4(*((uchar4*)getElementAt(a, iPixel.x, iPixel.y, lod)));
-        break;
-    }
-
-    return result;
-}
-
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float location) {
-    return rsSample(a, s, location, 0);
-}
-
-//TODO: implement 1D sampling
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float location, float lod) {
-    float4 result;
-    return result;
-}
-
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float2 location) {
-    return rsSample(a, s, location, 0.0f);
-}
-
-static float4 sample_LOD_LinearPixel(rs_allocation a, const Type_t *type,
-                                     uint32_t vecSize, rs_data_type dt,
-                                     rs_sampler s,
-                                     float2 uv, uint32_t lod) {
-    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
-    rs_sampler_value wrapT = rsgSamplerGetWrapT(s);
-
-    int32_t sourceW = type->mHal.state.lodDimX[lod];
-    int32_t sourceH = type->mHal.state.lodDimY[lod];
-
-    float2 dimF;
-    dimF.x = (float)(sourceW);
-    dimF.y = (float)(sourceH);
-    float2 pixelUV = uv * dimF;
-    int2 iPixel = convert_int2(pixelUV);
-
-    float2 frac = pixelUV - convert_float2(iPixel);
-
-    if (frac.x < 0.5f) {
-        iPixel.x -= 1;
-        frac.x += 0.5f;
-    } else {
-        frac.x -= 0.5f;
-    }
-    if (frac.y < 0.5f) {
-        iPixel.y -= 1;
-        frac.y += 0.5f;
-    } else {
-        frac.y -= 0.5f;
-    }
-    float2 oneMinusFrac = 1.0f - frac;
-
-    float4 weights;
-    weights.x = oneMinusFrac.x * oneMinusFrac.y;
-    weights.y = frac.x * oneMinusFrac.y;
-    weights.z = oneMinusFrac.x * frac.y;
-    weights.w = frac.x * frac.y;
-
-    int32_t nextX = wrapI(wrapS, iPixel.x + 1, sourceW);
-    int32_t nextY = wrapI(wrapT, iPixel.y + 1, sourceH);
-    iPixel.x = wrapI(wrapS, iPixel.x, sourceW);
-    iPixel.y = wrapI(wrapT, iPixel.y, sourceH);
-
-    return getBilinearSample(a, weights, iPixel, nextX, nextY, vecSize, dt, lod);
-}
-
-static float4 sample_LOD_NearestPixel(rs_allocation a, const Type_t *type,
-                                      uint32_t vecSize, rs_data_type dt,
-                                      rs_sampler s,
-                                      float2 uv, uint32_t lod) {
-    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
-    rs_sampler_value wrapT = rsgSamplerGetWrapT(s);
-
-    int32_t sourceW = type->mHal.state.lodDimX[lod];
-    int32_t sourceH = type->mHal.state.lodDimY[lod];
-
-    float2 dimF;
-    dimF.x = (float)(sourceW);
-    dimF.y = (float)(sourceH);
-    int2 iPixel = convert_int2(uv * dimF);
-
-    iPixel.x = wrapI(wrapS, iPixel.x, sourceW);
-    iPixel.y = wrapI(wrapT, iPixel.y, sourceH);
-    return getNearestSample(a, iPixel, vecSize, dt, lod);
-}
-
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float2 uv, float lod) {
-    // Find out what kind of input data we are sampling
-    rs_element elem = rsAllocationGetElement(a);
-    rs_data_kind dk = rsElementGetDataKind(elem);
-    rs_data_type dt = rsElementGetDataType(elem);
-
-    if (dk == RS_KIND_USER || (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5)) {
-        float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};
-        return zero;
-    }
-
-    uint32_t vecSize = rsElementGetVectorSize(elem);
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
-
-    rs_sampler_value sampleMin = rsgSamplerGetMinification(s);
-    rs_sampler_value sampleMag = rsgSamplerGetMagnification(s);
-
-    if (sampleMin == RS_SAMPLER_NEAREST &&
-        sampleMag == RS_SAMPLER_NEAREST) {
-        return sample_LOD_NearestPixel(a, type, vecSize, dt, s, uv, 0);
-    }
-
-    if (sampleMin == RS_SAMPLER_LINEAR_MIP_NEAREST) {
-        // clamp the lod to between zero and the highest available
-        lod = clamp(lod, 0.0f, (float)(type->mHal.state.lodCount - 1));
-        uint32_t nearestLOD = (uint32_t)round(lod);
-        return sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, nearestLOD);
-    }
-
-    if (sampleMin == RS_SAMPLER_LINEAR_MIP_LINEAR) {
-        // clamp the lod to between zero and the highest available
-        lod = clamp(lod, 0.0f, (float)(type->mHal.state.lodCount - 1));
-        uint32_t lod0 = (uint32_t)floor(lod);
-        uint32_t lod1 = (uint32_t)ceil(lod);
-        float4 sample0 = sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, lod0);
-        float4 sample1 = sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, lod1);
-        float frac = lod - (float)lod0;
-        return sample0 * (1.0f - frac) + sample1 * frac;
-    }
-
-    return sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, 0);
-}
-
-// TODO: implement cubemap lookups
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float3 location) {
-    return rsSample(a, s, location, 0.0f);
-}
-
-// TODO: implement cubemap lookups
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float3 location, float lod) {
-    float4 result;
-    return result;
-}
-
-#undef convert_float
diff --git a/lib/ScriptCRT/rs_core.h b/lib/ScriptCRT/rs_core.h
new file mode 100644
index 0000000..f90579c
--- /dev/null
+++ b/lib/ScriptCRT/rs_core.h
@@ -0,0 +1,252 @@
+#ifndef _RS_CORE_H_
+#define _RS_CORE_H_
+
+/*****************************************************************************
+ * CAUTION
+ *
+ * The following structure layout provides a more efficient way to access
+ * internal members of the C++ class Allocation owned by librs. Unfortunately,
+ * since this class has virtual members, we can't simply use offsetof() or any
+ * other compiler trickery to dynamically get the appropriate values at
+ * build-time. This layout may need to be updated whenever
+ * frameworks/base/libs/rs/rsAllocation.h is modified.
+ *
+ * Having the layout information available in this file allows us to
+ * accelerate functionality like rsAllocationGetDimX(). Without this
+ * information, we would not be able to inline the bitcode, thus resulting in
+ * potential runtime performance penalties for tight loops operating on
+ * allocations.
+ *
+ *****************************************************************************/
+typedef enum {
+    RS_ALLOCATION_MIPMAP_NONE = 0,
+    RS_ALLOCATION_MIPMAP_FULL = 1,
+    RS_ALLOCATION_MIPMAP_ON_SYNC_TO_TEXTURE = 2
+} rs_allocation_mipmap_control;
+
+typedef struct Allocation {
+    char __pad[28];  // NOTE(review): stands in for C++ members ahead of mHal; re-check on change
+    struct {
+        void * drv;
+        struct {
+            const void *type;  // cast to const Type_t * by getElementAt*() in rs_sample.c
+            uint32_t usageFlags;
+            rs_allocation_mipmap_control mipmapControl;
+            uint32_t dimensionX;
+            uint32_t dimensionY;
+            uint32_t dimensionZ;
+            uint32_t elementSizeBytes;  // per-element byte stride in getElementAt*() addressing
+            bool hasMipmaps;
+            bool hasFaces;
+            bool hasReferences;
+            void * usrPtr;
+            int32_t surfaceTextureID;
+            void * wndSurface;
+        } state;
+
+        struct DrvState {
+            void * mallocPtr;  // base address that getElementAt*() indexes from, byte-wise
+        } drvState;
+    } mHal;
+} Allocation_t;
+
+/*****************************************************************************
+ * CAUTION
+ *
+ * The following structure layout provides a more efficient way to access
+ * internal members of the C++ class ProgramStore owned by librs. Unfortunately,
+ * since this class has virtual members, we can't simply use offsetof() or any
+ * other compiler trickery to dynamically get the appropriate values at
+ * build-time. This layout may need to be updated whenever
+ * frameworks/base/libs/rs/rsProgramStore.h is modified.
+ *
+ * Having the layout information available in this file allows us to
+ * accelerate functionality like rsgProgramStoreGetDepthFunc(). Without this
+ * information, we would not be able to inline the bitcode, thus resulting in
+ * potential runtime performance penalties for tight loops operating on
+ * program store.
+ *
+ *****************************************************************************/
+typedef struct ProgramStore {
+    char __pad[36];
+    struct {
+        struct {
+            bool ditherEnable;
+            bool colorRWriteEnable;
+            bool colorGWriteEnable;
+            bool colorBWriteEnable;
+            bool colorAWriteEnable;
+            rs_blend_src_func blendSrc;
+            rs_blend_dst_func blendDst;
+            bool depthWriteEnable;
+            rs_depth_func depthFunc;
+        } state;
+    } mHal;
+} ProgramStore_t;
+
+/*****************************************************************************
+ * CAUTION
+ *
+ * The following structure layout provides a more efficient way to access
+ * internal members of the C++ class ProgramRaster owned by librs. Unfortunately,
+ * since this class has virtual members, we can't simply use offsetof() or any
+ * other compiler trickery to dynamically get the appropriate values at
+ * build-time. This layout may need to be updated whenever
+ * frameworks/base/libs/rs/rsProgramRaster.h is modified.
+ *
+ * Having the layout information available in this file allows us to
+ * accelerate functionality like rsgProgramRasterGetCullMode(). Without this
+ * information, we would not be able to inline the bitcode, thus resulting in
+ * potential runtime performance penalties for tight loops operating on
+ * program raster.
+ *
+ *****************************************************************************/
+typedef struct ProgramRaster {
+    char __pad[36];
+    struct {
+        struct {
+            bool pointSprite;
+            rs_cull_mode cull;
+        } state;
+    } mHal;
+} ProgramRaster_t;
+
+/*****************************************************************************
+ * CAUTION
+ *
+ * The following structure layout provides a more efficient way to access
+ * internal members of the C++ class Sampler owned by librs. Unfortunately,
+ * since this class has virtual members, we can't simply use offsetof() or any
+ * other compiler trickery to dynamically get the appropriate values at
+ * build-time. This layout may need to be updated whenever
+ * frameworks/base/libs/rs/rsSampler.h is modified.
+ *
+ * Having the layout information available in this file allows us to
+ * accelerate functionality like rsgSamplerGetMagnification(). Without this
+ * information, we would not be able to inline the bitcode, thus resulting in
+ * potential runtime performance penalties for tight loops operating on
+ * samplers.
+ *
+ *****************************************************************************/
+typedef struct Sampler {
+    char __pad[32];
+    struct {
+        struct {
+            rs_sampler_value magFilter;
+            rs_sampler_value minFilter;
+            rs_sampler_value wrapS;
+            rs_sampler_value wrapT;
+            rs_sampler_value wrapR;
+            float aniso;
+        } state;
+    } mHal;
+} Sampler_t;
+
+/*****************************************************************************
+ * CAUTION
+ *
+ * The following structure layout provides a more efficient way to access
+ * internal members of the C++ class Element owned by librs. Unfortunately,
+ * since this class has virtual members, we can't simply use offsetof() or any
+ * other compiler trickery to dynamically get the appropriate values at
+ * build-time. This layout may need to be updated whenever
+ * frameworks/base/libs/rs/rsElement.h is modified.
+ *
+ * Having the layout information available in this file allows us to
+ * accelerate functionality like rsElementGetSubElementCount(). Without this
+ * information, we would not be able to inline the bitcode, thus resulting in
+ * potential runtime performance penalties for tight loops operating on
+ * elements.
+ *
+ *****************************************************************************/
+typedef struct Element {
+    char __pad[28];
+    struct {
+        void *drv;
+        struct {
+            rs_data_type dataType;
+            rs_data_kind dataKind;
+            uint32_t vectorSize;
+            uint32_t elementSizeBytes;
+
+            // Subelements
+            const void **fields;
+            uint32_t *fieldArraySizes;
+            const char **fieldNames;
+            uint32_t *fieldNameLengths;
+            uint32_t *fieldOffsetBytes;
+            uint32_t fieldsCount;
+        } state;
+    } mHal;
+} Element_t;
+
+/*****************************************************************************
+ * CAUTION
+ *
+ * The following structure layout provides a more efficient way to access
+ * internal members of the C++ class Type owned by librs. Unfortunately,
+ * since this class has virtual members, we can't simply use offsetof() or any
+ * other compiler trickery to dynamically get the appropriate values at
+ * build-time. This layout may need to be updated whenever
+ * frameworks/base/libs/rs/rsType.h is modified.
+ *
+ * Having the layout information available in this file allows us to
+ * accelerate functionality like rsAllocationGetElement(). Without this
+ * information, we would not be able to inline the bitcode, thus resulting in
+ * potential runtime performance penalties for tight loops operating on
+ * types.
+ *
+ *****************************************************************************/
+typedef struct Type {
+    char __pad[28];  // NOTE(review): stands in for C++ members ahead of mHal; re-check on change
+    struct {
+        void *drv;
+        struct {
+            const void * element;
+            uint32_t dimX;
+            uint32_t dimY;
+            uint32_t dimZ;
+            uint32_t *lodDimX;  // indexed by LOD; used as the row width in 2D element addressing
+            uint32_t *lodDimY;
+            uint32_t *lodDimZ;
+            uint32_t *lodOffset;  // indexed by LOD; added to mallocPtr as a byte offset
+            uint32_t lodCount;
+            bool faces;
+        } state;
+    } mHal;
+} Type_t;
+
+/*****************************************************************************
+ * CAUTION
+ *
+ * The following structure layout provides a more efficient way to access
+ * internal members of the C++ class Mesh owned by librs. Unfortunately,
+ * since this class has virtual members, we can't simply use offsetof() or any
+ * other compiler trickery to dynamically get the appropriate values at
+ * build-time. This layout may need to be updated whenever
+ * frameworks/base/libs/rs/rsMesh.h is modified.
+ *
+ * Having the layout information available in this file allows us to
+ * accelerate functionality like rsMeshGetVertexAllocationCount(). Without this
+ * information, we would not be able to inline the bitcode, thus resulting in
+ * potential runtime performance penalties for tight loops operating on
+ * meshes.
+ *
+ *****************************************************************************/
+typedef struct Mesh {
+    char __pad[28];
+    struct {
+        void *drv;
+        struct {
+            void **vertexBuffers;
+            uint32_t vertexBuffersCount;
+
+            // indexBuffers[i] could be NULL, in which case only primitives[i] is used
+            void **indexBuffers;
+            uint32_t indexBuffersCount;
+            rs_primitive *primitives;
+            uint32_t primitivesCount;
+        } state;
+    } mHal;
+} Mesh_t;
+#endif // _RS_CORE_H_
diff --git a/lib/ScriptCRT/rs_sample.c b/lib/ScriptCRT/rs_sample.c
new file mode 100644
index 0000000..bf192c5
--- /dev/null
+++ b/lib/ScriptCRT/rs_sample.c
@@ -0,0 +1,394 @@
+#include "rs_core.rsh"
+#include "rs_graphics.rsh"
+#include "rs_core.h"
+
+/**
+* Allocation sampling
+*/
+/*
+ * Returns the address of element x in mip level lod of a 1D allocation.
+ *
+ * The level's byte offset comes from the Type's lodOffset table and elements
+ * are elementSizeBytes apart. No bounds checking is done here; the samplers in
+ * this file pass coordinates that have already been through wrapI().
+ */
+static const void * __attribute__((overloadable))
+        getElementAt1D(rs_allocation a, uint32_t x, uint32_t lod) {
+    Allocation_t *alloc = (Allocation_t *)a.p;
+    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
+    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.mallocPtr;
+
+    const uint32_t offset = type->mHal.state.lodOffset[lod];
+    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
+
+    return &p[offset + eSize * x];
+}
+
+/*
+ * Scalar-coordinate overload of getElementAt() for 1D allocations.
+ *
+ * NEAREST_SAMPLE_BODY() is shared by the 1D and 2D getNearestSample()
+ * overloads and always calls getElementAt(a, iPixel, lod). Without this
+ * overload the only three-argument candidate is the int2 one, so a scalar
+ * iPixel would be widened to (x, x) and addressed as a 2D coordinate.
+ */
+static const void * __attribute__((overloadable))
+        getElementAt(rs_allocation a, int32_t x, uint32_t lod) {
+    return getElementAt1D(a, (uint32_t)x, lod);
+}
+
+/*
+ * Returns the address of element (x, y) in mip level lod of a 2D allocation.
+ * Rows are lodDimX[lod] elements long and the level starts lodOffset[lod]
+ * bytes into the allocation's backing store. No bounds checking is done.
+ */
+static const void * __attribute__((overloadable))
+        getElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t lod) {
+    Allocation_t *allocation = (Allocation_t *)a.p;
+    const Type_t *typeInfo = (const Type_t*)allocation->mHal.state.type;
+    const uint8_t *base = (const uint8_t *)allocation->mHal.drvState.mallocPtr;
+
+    const uint32_t rowLength = typeInfo->mHal.state.lodDimX[lod];
+    const uint32_t elementIndex = x + y * rowLength;
+    const uint32_t byteOffset = typeInfo->mHal.state.lodOffset[lod]
+            + allocation->mHal.state.elementSizeBytes * elementIndex;
+
+    return base + byteOffset;
+}
+
+/* Vector-coordinate convenience form: uv.x is the column, uv.y the row. */
+static const void * __attribute__((overloadable))
+        getElementAt(rs_allocation a, int2 uv, uint32_t lod) {
+    const uint32_t column = uv.x;
+    const uint32_t row = uv.y;
+    return getElementAt(a, column, row, lod);
+}
+
+/*
+ * Maps an integer element coordinate into [0, size - 1].
+ * With RS_SAMPLER_WRAP the coordinate is first reduced modulo size (negative
+ * remainders are shifted up by size); every mode then clamps to the valid
+ * range.
+ */
+static int32_t wrapI(rs_sampler_value wrap, int32_t coord, int32_t size) {
+    if (wrap == RS_SAMPLER_WRAP) {
+        const int32_t remainder = coord % size;
+        coord = (remainder < 0) ? remainder + size : remainder;
+    }
+
+    const int32_t last = size - 1;
+    return max(0, min(coord, last));
+}
+
+// 565 Conversion bits taken from SkBitmap
+// Bit width of each channel in a packed 16-bit pixel: 5 red, 6 green, 5 blue.
+#define SK_R16_BITS     5
+#define SK_G16_BITS     6
+#define SK_B16_BITS     5
+
+// Position of each channel's lowest bit: blue sits in the low bits, green
+// above it, red at the top.
+#define SK_R16_SHIFT    (SK_B16_BITS + SK_G16_BITS)
+#define SK_G16_SHIFT    (SK_B16_BITS)
+#define SK_B16_SHIFT    0
+
+// All-ones masks as wide as the corresponding channel.
+#define SK_R16_MASK     ((1 << SK_R16_BITS) - 1)
+#define SK_G16_MASK     ((1 << SK_G16_BITS) - 1)
+#define SK_B16_MASK     ((1 << SK_B16_BITS) - 1)
+
+// Extract one channel from a packed pixel as an unsigned value of that
+// channel's native width (shift down, then mask).
+#define SkGetPackedR16(color)   (((unsigned)(color) >> SK_R16_SHIFT) & SK_R16_MASK)
+#define SkGetPackedG16(color)   (((unsigned)(color) >> SK_G16_SHIFT) & SK_G16_MASK)
+#define SkGetPackedB16(color)   (((unsigned)(color) >> SK_B16_SHIFT) & SK_B16_MASK)
+
+// Widens a 5-bit red value to 8 bits: the value is shifted to the top of the
+// byte and its own high bits are copied into the vacated low bits, so 0 stays
+// 0 and 31 becomes 255.
+static inline unsigned SkR16ToR32(unsigned r) {
+    const unsigned shifted = r << (8 - SK_R16_BITS);
+    const unsigned refill = r >> (2 * SK_R16_BITS - 8);
+    return shifted | refill;
+}
+
+// Widens a 6-bit green value to 8 bits: the value is shifted to the top of the
+// byte and its own high bits are copied into the vacated low bits, so 0 stays
+// 0 and 63 becomes 255.
+static inline unsigned SkG16ToG32(unsigned g) {
+    const unsigned shifted = g << (8 - SK_G16_BITS);
+    const unsigned refill = g >> (2 * SK_G16_BITS - 8);
+    return shifted | refill;
+}
+
+// Widens a 5-bit blue value to 8 bits: the value is shifted to the top of the
+// byte and its own high bits are copied into the vacated low bits, so 0 stays
+// 0 and 31 becomes 255.
+static inline unsigned SkB16ToB32(unsigned b) {
+    const unsigned shifted = b << (8 - SK_B16_BITS);
+    const unsigned refill = b >> (2 * SK_B16_BITS - 8);
+    return shifted | refill;
+}
+
+// Packed 16-bit pixel -> one channel widened to 8 bits (extract, then expand).
+#define SkPacked16ToR32(c)      SkR16ToR32(SkGetPackedR16(c))
+#define SkPacked16ToG32(c)      SkG16ToG32(SkGetPackedG16(c))
+#define SkPacked16ToB32(c)      SkB16ToB32(SkGetPackedB16(c))
+
+// Unpacks a 16-bit 565 pixel into a float3 (x = red, y = green, z = blue),
+// each channel first widened to 8 bits and then converted to float.
+static float3 getFrom565(uint16_t color) {
+    const unsigned red = SkPacked16ToR32(color);
+    const unsigned green = SkPacked16ToG32(color);
+    const unsigned blue = SkPacked16ToB32(color);
+
+    float3 rgb;
+    rgb.x = (float)red;
+    rgb.y = (float)green;
+    rgb.z = (float)blue;
+    return rgb;
+}
+
+// Generates the 1D overload of getSample<vecsize>(): reads the elements at
+// iPixel and next from level lod through getElementAt1D(), converts each from
+// intype to outtype with convert, and returns p0 * weights.x + p1 * weights.y.
+#define SAMPLE_1D_FUNC(vecsize, intype, outtype, convert)                                       \
+        static outtype __attribute__((overloadable))                                            \
+                getSample##vecsize(rs_allocation a, float2 weights,                             \
+                                   int iPixel, int next, uint32_t lod) {                        \
+            intype *p0c = (intype*)getElementAt1D(a, iPixel, lod);                              \
+            intype *p1c = (intype*)getElementAt1D(a, next, lod);                                \
+            outtype p0 = convert(*p0c);                                                         \
+            outtype p1 = convert(*p1c);                                                         \
+            return p0 * weights.x + p1 * weights.y;                                             \
+        }
+// Generates the 2D overload of getSample<vecsize>(): reads the four elements
+// spanned by iPixel and next from level lod and returns their weighted sum.
+// Weight order: x -> (iPixel.x, iPixel.y), y -> (next.x, iPixel.y),
+//               z -> (iPixel.x, next.y),   w -> (next.x, next.y).
+#define SAMPLE_2D_FUNC(vecsize, intype, outtype, convert)                                       \
+        static outtype __attribute__((overloadable))                                            \
+                    getSample##vecsize(rs_allocation a, float4 weights,                         \
+                                       int2 iPixel, int2 next, uint32_t lod) {                  \
+            intype *p0c = (intype*)getElementAt(a, iPixel.x, iPixel.y, lod);                    \
+            intype *p1c = (intype*)getElementAt(a, next.x, iPixel.y, lod);                      \
+            intype *p2c = (intype*)getElementAt(a, iPixel.x, next.y, lod);                      \
+            intype *p3c = (intype*)getElementAt(a, next.x, next.y, lod);                        \
+            outtype p0 = convert(*p0c);                                                         \
+            outtype p1 = convert(*p1c);                                                         \
+            outtype p2 = convert(*p2c);                                                         \
+            outtype p3 = convert(*p3c);                                                         \
+            return p0 * weights.x + p1 * weights.y + p2 * weights.z + p3 * weights.w;           \
+        }
+
+// Instantiate the 1D and 2D getSample helpers for 1- to 4-component uchar
+// elements and for packed 565 pixels. For the single-component case the
+// "convert" argument is the cast (float), which expands to (float)(*p0c).
+SAMPLE_1D_FUNC(1, uchar, float, (float))
+SAMPLE_1D_FUNC(2, uchar2, float2, convert_float2)
+SAMPLE_1D_FUNC(3, uchar3, float3, convert_float3)
+SAMPLE_1D_FUNC(4, uchar4, float4, convert_float4)
+SAMPLE_1D_FUNC(565, uint16_t, float3, getFrom565)
+
+SAMPLE_2D_FUNC(1, uchar, float, (float))
+SAMPLE_2D_FUNC(2, uchar2, float2, convert_float2)
+SAMPLE_2D_FUNC(3, uchar3, float3, convert_float3)
+SAMPLE_2D_FUNC(4, uchar4, float4, convert_float4)
+SAMPLE_2D_FUNC(565, uint16_t, float3, getFrom565)
+
+// Sampler function body is the same for all dimensions
+//
+// Expands to the whole body of an rsSample(a, s, uv, lod) overload and expects
+// those four names in scope (lod is modified). Steps:
+//   1. Elements of kind RS_KIND_USER, or of a data type other than
+//      RS_TYPE_UNSIGNED_8 / RS_TYPE_UNSIGNED_5_6_5, yield an all-zero float4.
+//   2. lod <= 0: sample level 0, nearest if the magnification filter is
+//      RS_SAMPLER_NEAREST, linear otherwise.
+//   3. Minification RS_SAMPLER_LINEAR_MIP_NEAREST: linear sample of the level
+//      nearest to lod (after clamping lod to [0, lodCount - 1]).
+//   4. Minification RS_SAMPLER_LINEAR_MIP_LINEAR: linear samples of the two
+//      levels around lod, blended by the fractional part of lod.
+//   5. Anything else: nearest sample of level 0.
+// NOTE(review): step 5 is reached for every other minification setting when
+// lod > 0, including non-mipmapped linear filtering if the sampler supports
+// it -- confirm that nearest sampling is intended there.
+#define SAMPLE_FUNC_BODY()                                                                      \
+{                                                                                               \
+    rs_element elem = rsAllocationGetElement(a);                                                \
+    rs_data_kind dk = rsElementGetDataKind(elem);                                               \
+    rs_data_type dt = rsElementGetDataType(elem);                                               \
+                                                                                                \
+    if (dk == RS_KIND_USER || (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5)) {     \
+        float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};                                                 \
+        return zero;                                                                            \
+    }                                                                                           \
+                                                                                                \
+    uint32_t vecSize = rsElementGetVectorSize(elem);                                            \
+    Allocation_t *alloc = (Allocation_t *)a.p;                                                  \
+    const Type_t *type = (const Type_t*)alloc->mHal.state.type;                                 \
+                                                                                                \
+    rs_sampler_value sampleMin = rsgSamplerGetMinification(s);                                  \
+    rs_sampler_value sampleMag = rsgSamplerGetMagnification(s);                                 \
+                                                                                                \
+    if (lod <= 0.0f) {                                                                          \
+        if (sampleMag == RS_SAMPLER_NEAREST) {                                                  \
+            return sample_LOD_NearestPixel(a, type, vecSize, dt, s, uv, 0);                     \
+        }                                                                                       \
+        return sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, 0);                          \
+    }                                                                                           \
+                                                                                                \
+    if (sampleMin == RS_SAMPLER_LINEAR_MIP_NEAREST) {                                           \
+        lod = clamp(lod, 0.0f, (float)(type->mHal.state.lodCount - 1));                         \
+        uint32_t nearestLOD = (uint32_t)round(lod);                                             \
+        return sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, nearestLOD);                 \
+    }                                                                                           \
+                                                                                                \
+    if (sampleMin == RS_SAMPLER_LINEAR_MIP_LINEAR) {                                            \
+        lod = clamp(lod, 0.0f, (float)(type->mHal.state.lodCount - 1));                         \
+        uint32_t lod0 = (uint32_t)floor(lod);                                                   \
+        uint32_t lod1 = (uint32_t)ceil(lod);                                                    \
+        float4 sample0 = sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, lod0);             \
+        float4 sample1 = sample_LOD_LinearPixel(a, type, vecSize, dt, s, uv, lod1);             \
+        float frac = lod - (float)lod0;                                                         \
+        return sample0 * (1.0f - frac) + sample1 * frac;                                        \
+    }                                                                                           \
+                                                                                                \
+    return sample_LOD_NearestPixel(a, type, vecSize, dt, s, uv, 0);                             \
+} // End of sampler function body is the same for all dimensions
+
+// Body of the bilinear sampling function
+//
+// Expects a, weights, iPixel, next, vecSize, dt and lod in scope and dispatches
+// to the getSample<N>() overload matching the element layout. The result
+// starts out as all zeros, so components the element does not provide (for
+// example .w of a 565 pixel, or everything for an unexpected vecSize) are
+// returned as 0 rather than left uninitialized.
+#define BILINEAR_SAMPLE_BODY()                                                                  \
+{                                                                                               \
+    float4 result = {0.0f, 0.0f, 0.0f, 0.0f};                                                   \
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {                                                         \
+        result.xyz = getSample565(a, weights, iPixel, next, lod);                               \
+        return result;                                                                          \
+    }                                                                                           \
+                                                                                                \
+    switch(vecSize) {                                                                           \
+    case 1:                                                                                     \
+        result.x = getSample1(a, weights, iPixel, next, lod);                                   \
+        break;                                                                                  \
+    case 2:                                                                                     \
+        result.xy = getSample2(a, weights, iPixel, next, lod);                                  \
+        break;                                                                                  \
+    case 3:                                                                                     \
+        result.xyz = getSample3(a, weights, iPixel, next, lod);                                 \
+        break;                                                                                  \
+    case 4:                                                                                     \
+        result = getSample4(a, weights, iPixel, next, lod);                                     \
+        break;                                                                                  \
+    default:                                                                                    \
+        break;                                                                                  \
+    }                                                                                           \
+                                                                                                \
+    return result;                                                                              \
+} // End of body of the bilinear sampling function
+
+// Body of the nearest sampling function
+//
+// Expects a, iPixel, vecSize, dt and lod in scope, fetches the single element
+// at iPixel through getElementAt() and converts it to float. The result
+// starts out as all zeros, so components the element does not provide (for
+// example .w of a 565 pixel, or everything for an unexpected vecSize) are
+// returned as 0 rather than left uninitialized.
+#define NEAREST_SAMPLE_BODY()                                                                   \
+{                                                                                               \
+    float4 result = {0.0f, 0.0f, 0.0f, 0.0f};                                                   \
+    if (dt == RS_TYPE_UNSIGNED_5_6_5) {                                                         \
+        result.xyz = getFrom565(*(uint16_t*)getElementAt(a, iPixel, lod));                      \
+        return result;                                                                          \
+    }                                                                                           \
+                                                                                                \
+    switch(vecSize) {                                                                           \
+    case 1:                                                                                     \
+        result.x = (float)(*((uchar*)getElementAt(a, iPixel, lod)));                            \
+        break;                                                                                  \
+    case 2:                                                                                     \
+        result.xy = convert_float2(*((uchar2*)getElementAt(a, iPixel, lod)));                   \
+        break;                                                                                  \
+    case 3:                                                                                     \
+        result.xyz = convert_float3(*((uchar3*)getElementAt(a, iPixel, lod)));                  \
+        break;                                                                                  \
+    case 4:                                                                                     \
+        result = convert_float4(*((uchar4*)getElementAt(a, iPixel, lod)));                      \
+        break;                                                                                  \
+    default:                                                                                    \
+        break;                                                                                  \
+    }                                                                                           \
+                                                                                                \
+    return result;                                                                              \
+} // End of body of the nearest sampling function
+
+// 1D linear fetch: blends the elements at iPixel and next of level lod using
+// the two weights. The work is done by BILINEAR_SAMPLE_BODY(), which picks the
+// getSample<N>() helper from dt and vecSize; the float2 weights and scalar
+// coordinates select the 1D helpers.
+static float4 __attribute__((overloadable))
+        getBilinearSample(rs_allocation a, float2 weights,
+                          int32_t iPixel, int32_t next,
+                          uint32_t vecSize, rs_data_type dt, uint32_t lod) {
+    BILINEAR_SAMPLE_BODY()
+}
+
+// 2D bilinear fetch: blends the four elements spanned by iPixel and next of
+// level lod using the four weights. The work is done by
+// BILINEAR_SAMPLE_BODY(); the float4 weights and int2 coordinates select the
+// 2D getSample<N>() helpers.
+static float4 __attribute__((overloadable))
+        getBilinearSample(rs_allocation a, float4 weights,
+                          int2 iPixel, int2 next,
+                          uint32_t vecSize, rs_data_type dt, uint32_t lod) {
+    BILINEAR_SAMPLE_BODY()
+}
+
+// 1D nearest fetch: converts the single element at iPixel of level lod to
+// float via NEAREST_SAMPLE_BODY().
+// NOTE(review): the shared body calls getElementAt(a, iPixel, lod) with a
+// scalar iPixel here; make sure that call resolves to a 1D accessor rather
+// than to the int2 (x, y) overload.
+static float4  __attribute__((overloadable))
+        getNearestSample(rs_allocation a, int32_t iPixel, uint32_t vecSize,
+                         rs_data_type dt, uint32_t lod) {
+    NEAREST_SAMPLE_BODY()
+}
+
+// 2D nearest fetch: converts the single element at (iPixel.x, iPixel.y) of
+// level lod to float via NEAREST_SAMPLE_BODY().
+static float4  __attribute__((overloadable))
+        getNearestSample(rs_allocation a, int2 iPixel, uint32_t vecSize,
+                         rs_data_type dt, uint32_t lod) {
+    NEAREST_SAMPLE_BODY()
+}
+
+/*
+ * Linearly filtered 1D lookup in mip level lod.
+ *
+ * uv is scaled by the level's width, so one unit of uv spans the whole level.
+ * Element centres sit at i + 0.5; the two elements on either side of the
+ * sample position are blended, and their indices go through wrapI() with the
+ * sampler's S wrap mode.
+ */
+static float4 __attribute__((overloadable))
+        sample_LOD_LinearPixel(rs_allocation a, const Type_t *type,
+                               uint32_t vecSize, rs_data_type dt,
+                               rs_sampler s,
+                               float uv, uint32_t lod) {
+    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    float pixelUV = uv * (float)(sourceW);
+
+    // floor() rather than a truncating cast: truncation rounds negative
+    // positions toward zero, which leaves frac negative and produces blend
+    // weights outside [0, 1] once wrapI() wraps the index back into range.
+    int32_t iPixel = (int32_t)floor(pixelUV);
+    float frac = pixelUV - (float)iPixel;
+
+    // Shift by half an element so the weights are measured between centres.
+    if (frac < 0.5f) {
+        iPixel -= 1;
+        frac += 0.5f;
+    } else {
+        frac -= 0.5f;
+    }
+
+    float oneMinusFrac = 1.0f - frac;
+
+    float2 weights;
+    weights.x = oneMinusFrac;
+    weights.y = frac;
+
+    // Compute the neighbour before iPixel itself is wrapped.
+    int32_t next = wrapI(wrapS, iPixel + 1, sourceW);
+    iPixel = wrapI(wrapS, iPixel, sourceW);
+
+    return getBilinearSample(a, weights, iPixel, next, vecSize, dt, lod);
+}
+
+/*
+ * Nearest-element 1D lookup in mip level lod.
+ *
+ * uv is scaled by the level's width and rounded down to an element index,
+ * which then goes through wrapI() with the sampler's S wrap mode.
+ */
+static float4 __attribute__((overloadable))
+        sample_LOD_NearestPixel(rs_allocation a, const Type_t *type,
+                                uint32_t vecSize, rs_data_type dt,
+                                rs_sampler s,
+                                float uv, uint32_t lod) {
+    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+
+    // floor() rather than a truncating cast: truncation maps every position
+    // in (-1, 0) to element 0 instead of element -1, so wrapped lookups just
+    // below zero would pick the wrong element.
+    int32_t iPixel = (int32_t)floor(uv * (float)(sourceW));
+    iPixel = wrapI(wrapS, iPixel, sourceW);
+
+    return getNearestSample(a, iPixel, vecSize, dt, lod);
+}
+
+/*
+ * Bilinearly filtered 2D lookup in mip level lod.
+ *
+ * uv is scaled by the level's width and height. Element centres sit at
+ * i + 0.5 on each axis; the four elements around the sample position are
+ * blended, and their indices go through wrapI() with the sampler's S wrap
+ * mode for x and T wrap mode for y.
+ */
+static float4 __attribute__((overloadable))
+        sample_LOD_LinearPixel(rs_allocation a, const Type_t *type,
+                               uint32_t vecSize, rs_data_type dt,
+                               rs_sampler s,
+                               float2 uv, uint32_t lod) {
+    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
+    rs_sampler_value wrapT = rsgSamplerGetWrapT(s);
+
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    int32_t sourceH = type->mHal.state.lodDimY[lod];
+
+    float2 dimF;
+    dimF.x = (float)(sourceW);
+    dimF.y = (float)(sourceH);
+    float2 pixelUV = uv * dimF;
+
+    // floor() per component rather than a truncating conversion: truncation
+    // rounds negative positions toward zero, which leaves frac negative and
+    // produces blend weights outside [0, 1] once the indices are wrapped.
+    int2 iPixel;
+    iPixel.x = (int32_t)floor(pixelUV.x);
+    iPixel.y = (int32_t)floor(pixelUV.y);
+
+    float2 frac = pixelUV - convert_float2(iPixel);
+
+    // Shift by half an element on each axis so the weights are measured
+    // between element centres.
+    if (frac.x < 0.5f) {
+        iPixel.x -= 1;
+        frac.x += 0.5f;
+    } else {
+        frac.x -= 0.5f;
+    }
+    if (frac.y < 0.5f) {
+        iPixel.y -= 1;
+        frac.y += 0.5f;
+    } else {
+        frac.y -= 0.5f;
+    }
+    float2 oneMinusFrac = 1.0f - frac;
+
+    // Order matches getSample<N>(): x = (x0, y0), y = (x1, y0),
+    // z = (x0, y1), w = (x1, y1).
+    float4 weights;
+    weights.x = oneMinusFrac.x * oneMinusFrac.y;
+    weights.y = frac.x * oneMinusFrac.y;
+    weights.z = oneMinusFrac.x * frac.y;
+    weights.w = frac.x * frac.y;
+
+    // Compute the neighbours before iPixel itself is wrapped.
+    int2 next;
+    next.x = wrapI(wrapS, iPixel.x + 1, sourceW);
+    next.y = wrapI(wrapT, iPixel.y + 1, sourceH);
+    iPixel.x = wrapI(wrapS, iPixel.x, sourceW);
+    iPixel.y = wrapI(wrapT, iPixel.y, sourceH);
+
+    return getBilinearSample(a, weights, iPixel, next, vecSize, dt, lod);
+}
+
+/*
+ * Nearest-element 2D lookup in mip level lod.
+ *
+ * uv is scaled by the level's width and height and rounded down to an element
+ * coordinate, which then goes through wrapI() with the sampler's S wrap mode
+ * for x and T wrap mode for y.
+ */
+static float4 __attribute__((overloadable))
+        sample_LOD_NearestPixel(rs_allocation a, const Type_t *type,
+                                uint32_t vecSize, rs_data_type dt,
+                                rs_sampler s,
+                                float2 uv, uint32_t lod) {
+    rs_sampler_value wrapS = rsgSamplerGetWrapS(s);
+    rs_sampler_value wrapT = rsgSamplerGetWrapT(s);
+
+    int32_t sourceW = type->mHal.state.lodDimX[lod];
+    int32_t sourceH = type->mHal.state.lodDimY[lod];
+
+    float2 dimF;
+    dimF.x = (float)(sourceW);
+    dimF.y = (float)(sourceH);
+    float2 pixelUV = uv * dimF;
+
+    // floor() per component rather than a truncating conversion: truncation
+    // maps every position in (-1, 0) to element 0 instead of element -1, so
+    // wrapped lookups just below zero would pick the wrong element.
+    int2 iPixel;
+    iPixel.x = (int32_t)floor(pixelUV.x);
+    iPixel.y = (int32_t)floor(pixelUV.y);
+
+    iPixel.x = wrapI(wrapS, iPixel.x, sourceW);
+    iPixel.y = wrapI(wrapT, iPixel.y, sourceH);
+    return getNearestSample(a, iPixel, vecSize, dt, lod);
+}
+
+// 1D sample at the base mip level: forwards to the (location, lod) overload
+// with a level of detail of 0.
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float location) {
+    // Float literal, like the float2 and float3 overloads, so the forwarded
+    // call matches the (float, float) overload without an int conversion.
+    return rsSample(a, s, location, 0.0f);
+}
+
+// 1D sample of allocation a at coordinate uv with an explicit level of detail.
+// uv is multiplied by the selected level's width, so one unit of uv spans the
+// level. Filter and wrap modes come from sampler s; see SAMPLE_FUNC_BODY() for
+// how lod and the filters select the lookup. Channel values are returned as
+// converted from the stored element, with no rescaling done in this file.
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float uv, float lod) {
+    SAMPLE_FUNC_BODY()
+}
+
+// 2D sample at the base mip level: forwards to the (location, lod) overload
+// with a level of detail of 0.
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float2 location) {
+    const float baseLod = 0.0f;
+    return rsSample(a, s, location, baseLod);
+}
+
+// 2D sample of allocation a at coordinate uv with an explicit level of detail.
+// uv is multiplied by the selected level's width and height, so one unit of uv
+// spans the level on each axis. Filter and wrap modes come from sampler s; see
+// SAMPLE_FUNC_BODY() for how lod and the filters select the lookup. Channel
+// values are returned as converted from the stored element, with no rescaling
+// done in this file.
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float2 uv, float lod) {
+    SAMPLE_FUNC_BODY()
+}
+
+// TODO: implement cubemap lookups
+// float3 sample at the base mip level: forwards to the (location, lod)
+// overload with a level of detail of 0.
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float3 location) {
+    const float baseLod = 0.0f;
+    return rsSample(a, s, location, baseLod);
+}
+
+// TODO: implement cubemap lookups
+// Placeholder: the arguments are ignored and an all-zero sample is returned,
+// the same value SAMPLE_FUNC_BODY() yields for elements it cannot sample.
+// Previously the returned float4 was never initialized.
+extern const float4 __attribute__((overloadable))
+        rsSample(rs_allocation a, rs_sampler s, float3 location, float lod) {
+    float4 result = {0.0f, 0.0f, 0.0f, 0.0f};
+    return result;
+}