am 0687cec6: am cb341688: Merge "Correctly align mSendBuffer to contain doubles."

* commit '0687cec66d6fe9e7524685476aa6e096cde28c0c':
  Correctly align mSendBuffer to contain doubles.
diff --git a/Android.mk b/Android.mk
index 0dde1a4..5676462 100644
--- a/Android.mk
+++ b/Android.mk
@@ -2,14 +2,12 @@
 LOCAL_PATH:=$(call my-dir)
 
 rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable
-ifeq ($(ARCH_ARM_HAVE_NEON), true)
-  rs_base_CFLAGS += -DARCH_ARM_HAVE_NEON
-endif
 ifeq ($(TARGET_BUILD_PDK), true)
   rs_base_CFLAGS += -D__RS_PDK__
 endif
 
 include $(CLEAR_VARS)
+LOCAL_CLANG := true
 LOCAL_MODULE := libRSDriver
 
 LOCAL_SRC_FILES:= \
@@ -19,6 +17,9 @@
 	driver/rsdFrameBuffer.cpp \
 	driver/rsdFrameBufferObj.cpp \
 	driver/rsdGL.cpp \
+	driver/rsdIntrinsics.cpp \
+	driver/rsdIntrinsicConvolve3x3.cpp \
+	driver/rsdIntrinsicColorMatrix.cpp \
 	driver/rsdMesh.cpp \
 	driver/rsdMeshObj.cpp \
 	driver/rsdPath.cpp \
@@ -32,8 +33,14 @@
 	driver/rsdShaderCache.cpp \
 	driver/rsdVertexArray.cpp
 
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+    LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
+    LOCAL_SRC_FILES+= \
+        driver/rsdIntrinsics_Convolve.S
+endif
+
 LOCAL_SHARED_LIBRARIES += libcutils libutils libEGL libGLESv1_CM libGLESv2
-LOCAL_SHARED_LIBRARIES += libbcc libbcinfo libgui
+LOCAL_SHARED_LIBRARIES += libbcc libbcinfo libgui libsync
 
 LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
 
@@ -66,6 +73,7 @@
 RSG_GENERATOR:=$(LOCAL_BUILT_MODULE)
 
 include $(CLEAR_VARS)
+LOCAL_CLANG := true
 LOCAL_MODULE := libRS
 
 LOCAL_MODULE_CLASS := SHARED_LIBRARIES
@@ -135,13 +143,14 @@
 	rsScriptC.cpp \
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
+	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
 	rsThreadIO.cpp \
 	rsType.cpp
 
 LOCAL_SHARED_LIBRARIES += libcutils libutils libEGL libGLESv1_CM libGLESv2 libbcc
-LOCAL_SHARED_LIBRARIES += libui libbcinfo libgui
+LOCAL_SHARED_LIBRARIES += libui libbcinfo libgui libsync
 
 LOCAL_STATIC_LIBRARIES := libft2 libRSDriver
 
@@ -226,6 +235,7 @@
 	rsScriptC.cpp \
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
+	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
 	rsThreadIO.cpp \
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 4eb5a46..8240b10 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -78,6 +78,17 @@
     return 0;
 }
 
+uint8_t *GetOffsetPtr(const android::renderscript::Allocation *alloc,  // Byte address of element (xoff, yoff) in mip level `lod` of cube face `face`.
+                      uint32_t xoff, uint32_t yoff, uint32_t lod,  // None of these is range-checked here.
+                      RsAllocationCubemapFace face) {
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+    uint8_t *ptr = (uint8_t *)drv->lod[lod].mallocPtr;  // start of this mip level
+    ptr += face * drv->faceOffset;  // step over `face` whole faces
+    ptr += yoff * drv->lod[lod].stride;  // step down yoff rows of this level
+    ptr += xoff * alloc->mHal.state.elementSizeBytes;  // step across xoff elements
+    return ptr;
+}
+
 
 static void Update2DTexture(const Context *rsc, const Allocation *alloc, const void *ptr,
                             uint32_t xoff, uint32_t yoff, uint32_t lod,
@@ -109,8 +120,7 @@
     rsdGLCheckError(rsc, "Upload2DTexture 1 ");
     for (uint32_t face = 0; face < faceCount; face ++) {
         for (uint32_t lod = 0; lod < alloc->mHal.state.type->getLODCount(); lod++) {
-            const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.mallocPtr;
-            p += alloc->mHal.state.type->getLODFaceOffset(lod, (RsAllocationCubemapFace)face, 0, 0);
+            const uint8_t *p = GetOffsetPtr(alloc, 0, 0, lod, (RsAllocationCubemapFace)face);
 
             GLenum t = GL_TEXTURE_2D;
             if (alloc->mHal.state.hasFaces) {
@@ -151,7 +161,7 @@
         return;
     }
 
-    if (!alloc->getPtr()) {
+    if (!drv->lod[0].mallocPtr) {
         return;
     }
 
@@ -165,9 +175,10 @@
     Upload2DTexture(rsc, alloc, isFirstUpload);
 
     if (!(alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_SCRIPT)) {
-        if (alloc->mHal.drvState.mallocPtr) {
-            free(alloc->mHal.drvState.mallocPtr);
-            alloc->mHal.drvState.mallocPtr = NULL;
+        if (alloc->mHal.drvState.mallocPtrLOD0) {
+            free(alloc->mHal.drvState.mallocPtrLOD0);
+            alloc->mHal.drvState.mallocPtrLOD0 = NULL;
+            drv->lod[0].mallocPtr = NULL;
         }
     }
     rsdGLCheckError(rsc, "UploadToTexture");
@@ -214,26 +225,85 @@
     }
     RSD_CALL_GL(glBindBuffer, drv->glTarget, drv->bufferID);
     RSD_CALL_GL(glBufferData, drv->glTarget, alloc->mHal.state.type->getSizeBytes(),
-                 alloc->mHal.drvState.mallocPtr, GL_DYNAMIC_DRAW);
+                 alloc->mHal.drvState.mallocPtrLOD0, GL_DYNAMIC_DRAW);
     RSD_CALL_GL(glBindBuffer, drv->glTarget, 0);
     rsdGLCheckError(rsc, "UploadToBufferObject");
 }
 
+static size_t AllocationBuildPointerTable(const Context *rsc, const Allocation *alloc,
+        const Type *type, uint8_t *ptr) {
+    // Lays out every mip level of `type` relative to `ptr` and returns the bytes required (ptr may be NULL).
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+
+    drv->lod[0].dimX = type->getDimX();
+    drv->lod[0].dimY = type->getDimY();
+    drv->lod[0].mallocPtr = 0;
+    drv->lod[0].stride = drv->lod[0].dimX * type->getElementSizeBytes();
+    drv->lodCount = type->getLODCount();
+    drv->faceCount = type->getDimFaces();
+    // NOTE(review): lod[0].dimZ is not assigned here and keeps its previous value - confirm.
+    size_t offsets[Allocation::MAX_LOD];
+    memset(offsets, 0, sizeof(offsets));
+
+    size_t o = drv->lod[0].stride * rsMax(drv->lod[0].dimY, 1u) * rsMax(drv->lod[0].dimZ, 1u);  // bytes used by level 0
+    if(drv->lodCount > 1) {
+        uint32_t tx = drv->lod[0].dimX;
+        uint32_t ty = drv->lod[0].dimY;
+        uint32_t tz = drv->lod[0].dimZ;
+        for (uint32_t lod=1; lod < drv->lodCount; lod++) {
+            if (tx > 1) tx >>= 1;  // halve first so level `lod` is half of level lod-1 (floor of 1)
+            if (ty > 1) ty >>= 1;
+            if (tz > 1) tz >>= 1;
+            drv->lod[lod].dimX = tx;
+            drv->lod[lod].dimY = ty;
+            drv->lod[lod].dimZ = tz;
+            drv->lod[lod].stride = tx * type->getElementSizeBytes();
+            offsets[lod] = o;  // this level starts where the previous ones end
+            o += drv->lod[lod].stride * rsMax(ty, 1u) * rsMax(tz, 1u);
+        }
+    }
+    drv->faceOffset = o;  // one face occupies one complete mip chain
+    // Turn the per-level offsets into pointers relative to `ptr`.
+    drv->lod[0].mallocPtr = ptr;
+    for (uint32_t lod=1; lod < drv->lodCount; lod++) {
+        drv->lod[lod].mallocPtr = ptr + offsets[lod];
+    }
+    alloc->mHal.drvState.strideLOD0 = drv->lod[0].stride;
+    alloc->mHal.drvState.mallocPtrLOD0 = ptr;
+
+    size_t allocSize = drv->faceOffset;
+    if(drv->faceCount) {
+        allocSize *= 6;  // a non-zero face count means six faces stored back to back
+    }
+
+    return allocSize;
+}
+
 bool rsdAllocationInit(const Context *rsc, Allocation *alloc, bool forceZero) {
     DrvAllocation *drv = (DrvAllocation *)calloc(1, sizeof(DrvAllocation));
     if (!drv) {
         return false;
     }
+    alloc->mHal.drv = drv;
 
-    void * ptr = NULL;
+    // Calculate the object size.
+    size_t allocSize = AllocationBuildPointerTable(rsc, alloc, alloc->getType(), NULL);
+
+    uint8_t * ptr = NULL;
     if (alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_IO_OUTPUT) {
     } else {
-        ptr = malloc(alloc->mHal.state.type->getSizeBytes());
+
+        ptr = (uint8_t *)malloc(allocSize);
         if (!ptr) {
             free(drv);
             return false;
         }
     }
+    // Build the pointer tables
+    size_t verifySize = AllocationBuildPointerTable(rsc, alloc, alloc->getType(), ptr);
+    if(allocSize != verifySize) {
+        rsAssert(!"Size mismatch");
+    }
 
     drv->glTarget = GL_NONE;
     if (alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE) {
@@ -251,10 +321,6 @@
     drv->glType = rsdTypeToGLType(alloc->mHal.state.type->getElement()->getComponent().getType());
     drv->glFormat = rsdKindToGLFormat(alloc->mHal.state.type->getElement()->getComponent().getKind());
 
-
-    alloc->mHal.drvState.mallocPtr = ptr;
-    alloc->mHal.drvState.stride = alloc->mHal.state.dimensionX * alloc->mHal.state.elementSizeBytes;
-    alloc->mHal.drv = drv;
     if (forceZero && ptr) {
         memset(ptr, 0, alloc->mHal.state.type->getSizeBytes());
     }
@@ -263,8 +329,6 @@
         drv->uploadDeferred = true;
     }
 
-    drv->width = alloc->getType()->getDimX();
-    drv->height = alloc->getType()->getDimY();
 
     drv->readBackFBO = NULL;
 
@@ -289,9 +353,9 @@
         drv->renderTargetID = 0;
     }
 
-    if (alloc->mHal.drvState.mallocPtr) {
-        free(alloc->mHal.drvState.mallocPtr);
-        alloc->mHal.drvState.mallocPtr = NULL;
+    if (alloc->mHal.drvState.mallocPtrLOD0) {
+        free(alloc->mHal.drvState.mallocPtrLOD0);
+        alloc->mHal.drvState.mallocPtrLOD0 = NULL;
     }
     if (drv->readBackFBO != NULL) {
         delete drv->readBackFBO;
@@ -305,16 +369,22 @@
                          const Type *newType, bool zeroNew) {
     DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
 
-    alloc->mHal.drvState.mallocPtr = (uint8_t *)realloc(
-            alloc->mHal.drvState.mallocPtr, newType->getSizeBytes());
+    void * oldPtr = drv->lod[0].mallocPtr;
+    // Calculate the object size
+    size_t s = AllocationBuildPointerTable(rsc, alloc, newType, NULL);
+    uint8_t *ptr = (uint8_t *)realloc(oldPtr, s);
+    // Build the relative pointer tables.
+    size_t verifySize = AllocationBuildPointerTable(rsc, alloc, newType, ptr);
+    if(s != verifySize) {
+        rsAssert(!"Size mismatch");
+    }
 
     const uint32_t oldDimX = alloc->mHal.state.dimensionX;
     const uint32_t dimX = newType->getDimX();
 
     if (dimX > oldDimX) {
-        const Element *e = alloc->mHal.state.type->getElement();
-        uint32_t stride = e->getSizeBytes();
-        memset(((uint8_t *)alloc->mHal.drvState.mallocPtr) + stride * oldDimX,
+        uint32_t stride = alloc->mHal.state.elementSizeBytes;
+        memset(((uint8_t *)alloc->mHal.drvState.mallocPtrLOD0) + stride * oldDimX,
                  0, stride * (dimX - oldDimX));
     }
 }
@@ -342,8 +412,8 @@
     drv->readBackFBO->setActive(rsc);
 
     // Do the readback
-    RSD_CALL_GL(glReadPixels, 0, 0, alloc->getType()->getDimX(), alloc->getType()->getDimY(),
-                 drv->glFormat, drv->glType, alloc->getPtr());
+    RSD_CALL_GL(glReadPixels, 0, 0, drv->lod[0].dimX, drv->lod[0].dimY,
+                drv->glFormat, drv->glType, drv->lod[0].mallocPtr);
 
     // Revert framebuffer to its original
     lastFbo->setActive(rsc);
@@ -399,19 +469,12 @@
 static bool IoGetBuffer(const Context *rsc, Allocation *alloc, ANativeWindow *nw) {
     DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
 
-    int32_t r = nw->dequeueBuffer(nw, &drv->wndBuffer);
+    int32_t r = native_window_dequeue_buffer_and_wait(nw, &drv->wndBuffer);
     if (r) {
         rsc->setError(RS_ERROR_DRIVER, "Error getting next IO output buffer.");
         return false;
     }
 
-    // This lock is implicitly released by the queue buffer in IoSend
-    r = nw->lockBuffer(nw, drv->wndBuffer);
-    if (r) {
-        rsc->setError(RS_ERROR_DRIVER, "Error locking next IO output buffer.");
-        return false;
-    }
-
     // Must lock the whole surface
     GraphicBufferMapper &mapper = GraphicBufferMapper::get();
     Rect bounds(drv->wndBuffer->width, drv->wndBuffer->height);
@@ -420,8 +483,9 @@
     mapper.lock(drv->wndBuffer->handle,
             GRALLOC_USAGE_SW_READ_NEVER | GRALLOC_USAGE_SW_WRITE_OFTEN,
             bounds, &dst);
-    alloc->mHal.drvState.mallocPtr = dst;
-    alloc->mHal.drvState.stride = drv->wndBuffer->stride * alloc->mHal.state.elementSizeBytes;
+    drv->lod[0].mallocPtr = dst;
+    alloc->mHal.drvState.mallocPtrLOD0 = dst;
+    drv->lod[0].stride = drv->wndBuffer->stride * alloc->mHal.state.elementSizeBytes;
 
     return true;
 }
@@ -443,7 +507,7 @@
         ANativeWindow *old = alloc->mHal.state.wndSurface;
         GraphicBufferMapper &mapper = GraphicBufferMapper::get();
         mapper.unlock(drv->wndBuffer->handle);
-        old->queueBuffer(old, drv->wndBuffer);
+        old->queueBuffer(old, drv->wndBuffer, -1);
     }
 
     if (nw != NULL) {
@@ -492,7 +556,7 @@
     if (alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_SCRIPT) {
         GraphicBufferMapper &mapper = GraphicBufferMapper::get();
         mapper.unlock(drv->wndBuffer->handle);
-        int32_t r = nw->queueBuffer(nw, drv->wndBuffer);
+        int32_t r = nw->queueBuffer(nw, drv->wndBuffer, -1);
         if (r) {
             rsc->setError(RS_ERROR_DRIVER, "Error sending IO output buffer.");
             return;
@@ -514,8 +578,7 @@
     DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
 
     const uint32_t eSize = alloc->mHal.state.type->getElementSizeBytes();
-    uint8_t * ptr = (uint8_t *)alloc->mHal.drvState.mallocPtr;
-    ptr += eSize * xoff;
+    uint8_t * ptr = GetOffsetPtr(alloc, xoff, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
     uint32_t size = count * eSize;
 
     if (alloc->mHal.state.hasReferences) {
@@ -535,10 +598,9 @@
     uint32_t eSize = alloc->mHal.state.elementSizeBytes;
     uint32_t lineSize = eSize * w;
 
-    if (alloc->mHal.drvState.mallocPtr) {
+    if (drv->lod[0].mallocPtr) {
         const uint8_t *src = static_cast<const uint8_t *>(data);
-        uint8_t *dst = (uint8_t *)alloc->mHal.drvState.mallocPtr;
-        dst += alloc->mHal.state.type->getLODFaceOffset(lod, face, xoff, yoff);
+        uint8_t *dst = GetOffsetPtr(alloc, xoff, yoff, lod, face);
 
         for (uint32_t line=yoff; line < (yoff+h); line++) {
             if (alloc->mHal.state.hasReferences) {
@@ -547,7 +609,7 @@
             }
             memcpy(dst, src, lineSize);
             src += lineSize;
-            dst += alloc->mHal.drvState.stride;
+            dst += drv->lod[lod].stride;
         }
         drv->uploadDeferred = true;
     } else {
@@ -562,6 +624,56 @@
 
 }
 
+void rsdAllocationRead1D(const Context *rsc, const Allocation *alloc,  // Copies `count` elements starting at xoff into `data`.
+                         uint32_t xoff, uint32_t lod, uint32_t count,  // `lod` is not used: the source is always level 0.
+                         void *data, size_t sizeBytes) {  // NOTE(review): `sizeBytes` is unused, and rsdAllocation.h declares it uint32_t - confirm they agree.
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+
+    const uint32_t eSize = alloc->mHal.state.type->getElementSizeBytes();
+    const uint8_t * ptr = GetOffsetPtr(alloc, xoff, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
+    memcpy(data, ptr, count * eSize);  // copy length comes from count, not from sizeBytes
+}
+
+void rsdAllocationRead2D(const Context *rsc, const Allocation *alloc,  // Copies a w x h rectangle at (xoff, yoff) into `data`, rows tightly packed.
+                         uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+                         uint32_t w, uint32_t h, void *data, size_t sizeBytes) {  // NOTE(review): `sizeBytes` is unused, and rsdAllocation.h declares it uint32_t - confirm they agree.
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+
+    uint32_t eSize = alloc->mHal.state.elementSizeBytes;
+    uint32_t lineSize = eSize * w;  // bytes per row in the destination buffer
+
+    if (drv->lod[0].mallocPtr) {  // only proceeds when a level-0 pointer is present
+        uint8_t *dst = static_cast<uint8_t *>(data);
+        const uint8_t *src = GetOffsetPtr(alloc, xoff, yoff, lod, face);
+
+        for (uint32_t line=yoff; line < (yoff+h); line++) {  // one row per pass
+            memcpy(dst, src, lineSize);
+            dst += lineSize;
+            src += drv->lod[lod].stride;  // source rows are `stride` bytes apart
+        }
+    } else {
+        ALOGE("Add code to readback from non-script memory");  // `data` is left untouched on this path
+    }
+}
+
+void rsdAllocationRead3D(const Context *rsc, const Allocation *alloc,
+                         uint32_t xoff, uint32_t yoff, uint32_t zoff,
+                         uint32_t lod, RsAllocationCubemapFace face,
+                         uint32_t w, uint32_t h, uint32_t d, void *data, uint32_t sizeBytes) {
+    // Empty body: this function does nothing and leaves `data` untouched.
+}
+
+void * rsdAllocationLock1D(const android::renderscript::Context *rsc,  // Returns the level-0 data pointer; no lock is taken in this function.
+                          const android::renderscript::Allocation *alloc) {
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+    return drv->lod[0].mallocPtr;
+}
+
+void rsdAllocationUnlock1D(const android::renderscript::Context *rsc,
+                          const android::renderscript::Allocation *alloc) {
+    // Empty body: this function does nothing.
+}
+
 void rsdAllocationData1D_alloc(const android::renderscript::Context *rsc,
                                const android::renderscript::Allocation *dstAlloc,
                                uint32_t dstXoff, uint32_t dstLod, uint32_t count,
@@ -569,20 +681,6 @@
                                uint32_t srcXoff, uint32_t srcLod) {
 }
 
-uint8_t *getOffsetPtr(const android::renderscript::Allocation *alloc,
-                      uint32_t xoff, uint32_t yoff, uint32_t lod,
-                      RsAllocationCubemapFace face) {
-    uint8_t *ptr = static_cast<uint8_t *>(alloc->getPtr());
-    ptr += alloc->getType()->getLODOffset(lod, xoff, yoff);
-
-    if (face != 0) {
-        uint32_t totalSizeBytes = alloc->getType()->getSizeBytes();
-        uint32_t faceOffset = totalSizeBytes / 6;
-        ptr += faceOffset * (uint32_t)face;
-    }
-    return ptr;
-}
-
 
 void rsdAllocationData2D_alloc_script(const android::renderscript::Context *rsc,
                                       const android::renderscript::Allocation *dstAlloc,
@@ -593,8 +691,8 @@
                                       RsAllocationCubemapFace srcFace) {
     uint32_t elementSize = dstAlloc->getType()->getElementSizeBytes();
     for (uint32_t i = 0; i < h; i ++) {
-        uint8_t *dstPtr = getOffsetPtr(dstAlloc, dstXoff, dstYoff + i, dstLod, dstFace);
-        uint8_t *srcPtr = getOffsetPtr(srcAlloc, srcXoff, srcYoff + i, srcLod, srcFace);
+        uint8_t *dstPtr = GetOffsetPtr(dstAlloc, dstXoff, dstYoff + i, dstLod, dstFace);
+        uint8_t *srcPtr = GetOffsetPtr(srcAlloc, srcXoff, srcYoff + i, srcLod, srcFace);
         memcpy(dstPtr, srcPtr, w * elementSize);
 
         //ALOGE("COPIED dstXoff(%u), dstYoff(%u), dstLod(%u), dstFace(%u), w(%u), h(%u), srcXoff(%u), srcYoff(%u), srcLod(%u), srcFace(%u)",
@@ -635,8 +733,7 @@
     DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
 
     uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    uint8_t * ptr = (uint8_t *)alloc->mHal.drvState.mallocPtr;
-    ptr += eSize * x;
+    uint8_t * ptr = GetOffsetPtr(alloc, x, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
 
     const Element * e = alloc->mHal.state.type->getElement()->getField(cIdx);
     ptr += alloc->mHal.state.type->getElement()->getFieldOffsetBytes(cIdx);
@@ -656,8 +753,7 @@
     DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
 
     uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    uint8_t * ptr = (uint8_t *)alloc->mHal.drvState.mallocPtr;
-    ptr += (eSize * x) + (y * alloc->mHal.drvState.stride);
+    uint8_t * ptr = GetOffsetPtr(alloc, x, y, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
 
     const Element * e = alloc->mHal.state.type->getElement()->getField(cIdx);
     ptr += alloc->mHal.state.type->getElement()->getFieldOffsetBytes(cIdx);
@@ -671,4 +767,84 @@
     drv->uploadDeferred = true;
 }
 
+static void mip565(const Allocation *alloc, int lod, RsAllocationCubemapFace face) {  // Fills level lod+1 of `face` from level lod, treating elements as 16-bit values.
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+    uint32_t w = drv->lod[lod + 1].dimX;  // output width
+    uint32_t h = drv->lod[lod + 1].dimY;  // output height
+
+    for (uint32_t y=0; y < h; y++) {
+        uint16_t *oPtr = (uint16_t *)GetOffsetPtr(alloc, 0, y, lod + 1, face);
+        const uint16_t *i1 = (uint16_t *)GetOffsetPtr(alloc, 0, y*2, lod, face);  // upper source row
+        const uint16_t *i2 = (uint16_t *)GetOffsetPtr(alloc, 0, y*2+1, lod, face);  // lower source row; NOTE(review): not checked against the source level's dimY - confirm for 1-row levels
+
+        for (uint32_t x=0; x < w; x++) {
+            *oPtr = rsBoxFilter565(i1[0], i1[1], i2[0], i2[1]);  // combines a 2x2 block of source pixels (helper defined elsewhere)
+            oPtr ++;
+            i1 += 2;  // advance two source pixels per output pixel
+            i2 += 2;
+        }
+    }
+}
+
+static void mip8888(const Allocation *alloc, int lod, RsAllocationCubemapFace face) {  // Fills level lod+1 of `face` from level lod, treating elements as 32-bit values.
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+    uint32_t w = drv->lod[lod + 1].dimX;  // output width
+    uint32_t h = drv->lod[lod + 1].dimY;  // output height
+
+    for (uint32_t y=0; y < h; y++) {
+        uint32_t *oPtr = (uint32_t *)GetOffsetPtr(alloc, 0, y, lod + 1, face);
+        const uint32_t *i1 = (uint32_t *)GetOffsetPtr(alloc, 0, y*2, lod, face);  // upper source row
+        const uint32_t *i2 = (uint32_t *)GetOffsetPtr(alloc, 0, y*2+1, lod, face);  // lower source row; NOTE(review): not checked against the source level's dimY - confirm for 1-row levels
+
+        for (uint32_t x=0; x < w; x++) {
+            *oPtr = rsBoxFilter8888(i1[0], i1[1], i2[0], i2[1]);  // combines a 2x2 block of source pixels (helper defined elsewhere)
+            oPtr ++;
+            i1 += 2;  // advance two source pixels per output pixel
+            i2 += 2;
+        }
+    }
+}
+
+static void mip8(const Allocation *alloc, int lod, RsAllocationCubemapFace face) {  // Fills level lod+1 of `face` from level lod, treating elements as single bytes.
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+    uint32_t w = drv->lod[lod + 1].dimX;  // output width
+    uint32_t h = drv->lod[lod + 1].dimY;  // output height
+
+    for (uint32_t y=0; y < h; y++) {
+        uint8_t *oPtr = GetOffsetPtr(alloc, 0, y, lod + 1, face);
+        const uint8_t *i1 = GetOffsetPtr(alloc, 0, y*2, lod, face);  // upper source row
+        const uint8_t *i2 = GetOffsetPtr(alloc, 0, y*2+1, lod, face);  // lower source row; NOTE(review): not checked against the source level's dimY - confirm for 1-row levels
+
+        for (uint32_t x=0; x < w; x++) {
+            *oPtr = (uint8_t)(((uint32_t)i1[0] + i1[1] + i2[0] + i2[1]) * 0.25f);  // mean of the 2x2 block; the cast drops the fraction
+            oPtr ++;
+            i1 += 2;  // advance two source bytes per output byte
+            i2 += 2;
+        }
+    }
+}
+
+void rsdAllocationGenerateMipmaps(const Context *rsc, const Allocation *alloc) {  // Regenerates each mip level above 0 from the level directly below it.
+    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
+    if(!drv->lod[0].mallocPtr) {  // nothing to do without a level-0 pointer
+        return;
+    }
+    uint32_t numFaces = alloc->getType()->getDimFaces() ? 6 : 1;
+    for (uint32_t face = 0; face < numFaces; face ++) {
+        for (uint32_t lod=0; lod < (alloc->getType()->getLODCount() -1); lod++) {  // each pass writes level lod+1
+            switch (alloc->getType()->getElement()->getSizeBits()) {  // element sizes other than 32/16/8 bits fall through with no work done
+            case 32:
+                mip8888(alloc, lod, (RsAllocationCubemapFace)face);
+                break;
+            case 16:
+                mip565(alloc, lod, (RsAllocationCubemapFace)face);
+                break;
+            case 8:
+                mip8(alloc, lod, (RsAllocationCubemapFace)face);
+                break;
+            }
+        }
+    }
+}
+
 
diff --git a/driver/rsdAllocation.h b/driver/rsdAllocation.h
index 0b196a1..e6488b9 100644
--- a/driver/rsdAllocation.h
+++ b/driver/rsdAllocation.h
@@ -19,6 +19,7 @@
 
 #include <rs_hal.h>
 #include <rsRuntime.h>
+#include <rsAllocation.h>
 
 #include <GLES/gl.h>
 #include <GLES2/gl2.h>
@@ -39,9 +40,6 @@
     // Is this a legal structure to be used as an FBO render target
     uint32_t renderTargetID;
 
-    uint32_t width;
-    uint32_t height;
-
     GLenum glTarget;
     GLenum glType;
     GLenum glFormat;
@@ -51,6 +49,19 @@
     RsdFrameBufferObj * readBackFBO;
     ANativeWindow *wnd;
     ANativeWindowBuffer *wndBuffer;
+
+    struct LodState {  // Per-mip-level layout; the array below is indexed by level.
+        void * mallocPtr;  // start of this level's data
+        size_t stride;  // bytes from one row to the next
+        uint32_t dimX;
+        uint32_t dimY;
+        uint32_t dimZ;
+    } lod[android::renderscript::Allocation::MAX_LOD];
+    size_t faceOffset;  // byte distance between consecutive cube faces
+    uint32_t lodCount;  // number of lod[] entries in use
+    uint32_t faceCount;
+
+
 };
 
 GLenum rsdTypeToGLType(RsDataType t);
@@ -95,6 +106,27 @@
                          uint32_t lod, RsAllocationCubemapFace face,
                          uint32_t w, uint32_t h, uint32_t d, const void *data, uint32_t sizeBytes);
 
+void rsdAllocationRead1D(const android::renderscript::Context *rsc,
+                         const android::renderscript::Allocation *alloc,
+                         uint32_t xoff, uint32_t lod, uint32_t count,
+                         void *data, uint32_t sizeBytes);
+void rsdAllocationRead2D(const android::renderscript::Context *rsc,
+                         const android::renderscript::Allocation *alloc,
+                         uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+                         uint32_t w, uint32_t h,
+                         void *data, uint32_t sizeBytes);
+void rsdAllocationRead3D(const android::renderscript::Context *rsc,
+                         const android::renderscript::Allocation *alloc,
+                         uint32_t xoff, uint32_t yoff, uint32_t zoff,
+                         uint32_t lod, RsAllocationCubemapFace face,
+                         uint32_t w, uint32_t h, uint32_t d, void *data, uint32_t sizeBytes);
+
+void * rsdAllocationLock1D(const android::renderscript::Context *rsc,
+                          const android::renderscript::Allocation *alloc);
+void rsdAllocationUnlock1D(const android::renderscript::Context *rsc,
+                          const android::renderscript::Allocation *alloc);
+
+
 void rsdAllocationData1D_alloc(const android::renderscript::Context *rsc,
                                const android::renderscript::Allocation *dstAlloc,
                                uint32_t dstXoff, uint32_t dstLod, uint32_t count,
@@ -125,6 +157,8 @@
                                 uint32_t x, uint32_t y,
                                 const void *data, uint32_t elementOff, uint32_t sizeBytes);
 
+void rsdAllocationGenerateMipmaps(const android::renderscript::Context *rsc,
+                                  const android::renderscript::Allocation *alloc);
 
 
 
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 457a99d..35d6f3b 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -17,13 +17,19 @@
 #include "rsdCore.h"
 #include "rsdBcc.h"
 #include "rsdRuntime.h"
+#include "rsdAllocation.h"
+#include "rsdIntrinsics.h"
 
-#include <bcinfo/MetadataExtractor.h>
+#include <bcc/BCCContext.h>
+#include <bcc/Renderscript/RSCompilerDriver.h>
+#include <bcc/Renderscript/RSExecutable.h>
+#include <bcc/Renderscript/RSInfo.h>
 
 #include "rsContext.h"
 #include "rsElement.h"
 #include "rsScriptC.h"
 
+#include "utils/Vector.h"
 #include "utils/Timers.h"
 #include "utils/StopWatch.h"
 
@@ -31,23 +37,19 @@
 using namespace android::renderscript;
 
 struct DrvScript {
+    RsScriptIntrinsicID mIntrinsicID;
     int (*mRoot)();
     int (*mRootExpand)();
     void (*mInit)();
     void (*mFreeChildren)();
 
-    BCCScriptRef mBccScript;
+    bcc::BCCContext *mCompilerContext;
+    bcc::RSCompilerDriver *mCompilerDriver;
+    bcc::RSExecutable *mExecutable;
 
-    bcinfo::MetadataExtractor *ME;
-
-    InvokeFunc_t *mInvokeFunctions;
-    ForEachFunc_t *mForEachFunctions;
-    void ** mFieldAddress;
-    bool * mFieldIsObject;
-    const uint32_t *mExportForEachSignatureList;
-
-    const uint8_t * mScriptText;
-    uint32_t mScriptTextLength;
+    Allocation **mBoundAllocs;
+    RsdIntriniscFuncs_t mIntrinsicFuncs;
+    void * mIntrinsicData;
 };
 
 typedef void (*outer_foreach_t)(
@@ -72,127 +74,81 @@
                      size_t bitcodeSize,
                      uint32_t flags) {
     //ALOGE("rsdScriptCreate %p %p %p %p %i %i %p", rsc, resName, cacheDir, bitcode, bitcodeSize, flags, lookupFunc);
+    //ALOGE("rsdScriptInit %p %p", rsc, script);
 
     pthread_mutex_lock(&rsdgInitMutex);
 
-    size_t exportFuncCount = 0;
-    size_t exportVarCount = 0;
-    size_t objectSlotCount = 0;
-    size_t exportForEachSignatureCount = 0;
-
-    const char* coreLib = "/system/lib/libclcore.bc";
-    bcinfo::RSFloatPrecision prec;
-
+    bcc::RSExecutable *exec;
+    const bcc::RSInfo *info;
     DrvScript *drv = (DrvScript *)calloc(1, sizeof(DrvScript));
     if (drv == NULL) {
         goto error;
     }
     script->mHal.drv = drv;
 
-    drv->mBccScript = bccCreateScript();
+    drv->mCompilerContext = NULL;
+    drv->mCompilerDriver = NULL;
+    drv->mExecutable = NULL;
+
+    drv->mCompilerContext = new bcc::BCCContext();
+    if (drv->mCompilerContext == NULL) {
+        ALOGE("bcc: FAILS to create compiler context (out of memory)");
+        goto error;
+    }
+
+    drv->mCompilerDriver = new bcc::RSCompilerDriver();
+    if (drv->mCompilerDriver == NULL) {
+        ALOGE("bcc: FAILS to create compiler driver (out of memory)");
+        goto error;
+    }
+
     script->mHal.info.isThreadable = true;
-    drv->mScriptText = bitcode;
-    drv->mScriptTextLength = bitcodeSize;
 
+    drv->mCompilerDriver->setRSRuntimeLookupFunction(rsdLookupRuntimeStub);
+    drv->mCompilerDriver->setRSRuntimeLookupContext(script);
 
-    drv->ME = new bcinfo::MetadataExtractor((const char*)drv->mScriptText,
-                                            drv->mScriptTextLength);
-    if (!drv->ME->extract()) {
-      ALOGE("bcinfo: failed to read script metadata");
-      goto error;
-    }
+    exec = drv->mCompilerDriver->build(*drv->mCompilerContext,
+                                       cacheDir, resName,
+                                       (const char *)bitcode, bitcodeSize);
 
-    //ALOGE("mBccScript %p", script->mBccScript);
-
-    if (bccRegisterSymbolCallback(drv->mBccScript, &rsdLookupRuntimeStub, script) != 0) {
-        ALOGE("bcc: FAILS to register symbol callback");
+    if (exec == NULL) {
+        ALOGE("bcc: FAILS to prepare executable for '%s'", resName);
         goto error;
     }
 
-    if (bccReadBC(drv->mBccScript,
-                  resName,
-                  (char const *)drv->mScriptText,
-                  drv->mScriptTextLength, 0) != 0) {
-        ALOGE("bcc: FAILS to read bitcode");
-        goto error;
+    drv->mExecutable = exec;
+
+    exec->setThreadable(script->mHal.info.isThreadable);
+    if (!exec->syncInfo()) {
+        ALOGW("bcc: FAILS to synchronize the RS info file to the disk");
     }
 
-    // NEON-capable devices can use an accelerated math library for all
-    // reduced precision scripts.
-#if defined(ARCH_ARM_HAVE_NEON)
-    prec = drv->ME->getRSFloatPrecision();
-    if (prec != bcinfo::RS_FP_Full) {
-        coreLib = "/system/lib/libclcore_neon.bc";
-    }
-#endif
+    drv->mRoot = reinterpret_cast<int (*)()>(exec->getSymbolAddress("root"));
+    drv->mRootExpand =
+        reinterpret_cast<int (*)()>(exec->getSymbolAddress("root.expand"));
+    drv->mInit = reinterpret_cast<void (*)()>(exec->getSymbolAddress("init"));
+    drv->mFreeChildren =
+        reinterpret_cast<void (*)()>(exec->getSymbolAddress(".rs.dtor"));
 
-    if (bccLinkFile(drv->mBccScript, coreLib, 0) != 0) {
-        ALOGE("bcc: FAILS to link bitcode");
-        goto error;
-    }
-
-    if (bccPrepareExecutable(drv->mBccScript, cacheDir, resName, 0) != 0) {
-        ALOGE("bcc: FAILS to prepare executable");
-        goto error;
-    }
-
-    drv->mRoot = reinterpret_cast<int (*)()>(bccGetFuncAddr(drv->mBccScript, "root"));
-    drv->mRootExpand = reinterpret_cast<int (*)()>(bccGetFuncAddr(drv->mBccScript, "root.expand"));
-    drv->mInit = reinterpret_cast<void (*)()>(bccGetFuncAddr(drv->mBccScript, "init"));
-    drv->mFreeChildren = reinterpret_cast<void (*)()>(bccGetFuncAddr(drv->mBccScript, ".rs.dtor"));
-
-    exportFuncCount = drv->ME->getExportFuncCount();
-    if (exportFuncCount > 0) {
-        drv->mInvokeFunctions = (InvokeFunc_t*) calloc(exportFuncCount,
-                                                       sizeof(InvokeFunc_t));
-        bccGetExportFuncList(drv->mBccScript, exportFuncCount,
-                             (void **) drv->mInvokeFunctions);
-    } else {
-        drv->mInvokeFunctions = NULL;
-    }
-
-    exportVarCount = drv->ME->getExportVarCount();
-    if (exportVarCount > 0) {
-        drv->mFieldAddress = (void **) calloc(exportVarCount, sizeof(void*));
-        drv->mFieldIsObject = (bool *) calloc(exportVarCount, sizeof(bool));
-        bccGetExportVarList(drv->mBccScript, exportVarCount,
-                            (void **) drv->mFieldAddress);
-    } else {
-        drv->mFieldAddress = NULL;
-        drv->mFieldIsObject = NULL;
-    }
-
-    objectSlotCount = drv->ME->getObjectSlotCount();
-    if (objectSlotCount > 0) {
-        const uint32_t *objectSlotList = drv->ME->getObjectSlotList();
-        for (uint32_t ct=0; ct < objectSlotCount; ct++) {
-            drv->mFieldIsObject[objectSlotList[ct]] = true;
-        }
-    }
-
-    exportForEachSignatureCount = drv->ME->getExportForEachSignatureCount();
-    drv->mExportForEachSignatureList = drv->ME->getExportForEachSignatureList();
-    if (exportForEachSignatureCount > 0) {
-        drv->mForEachFunctions =
-            (ForEachFunc_t*) calloc(exportForEachSignatureCount,
-                                    sizeof(ForEachFunc_t));
-        bccGetExportForEachList(drv->mBccScript, exportForEachSignatureCount,
-                                (void **) drv->mForEachFunctions);
-    } else {
-        drv->mForEachFunctions = NULL;
-    }
-
+    info = &drv->mExecutable->getInfo();
     // Copy info over to runtime
-    script->mHal.info.exportedFunctionCount = drv->ME->getExportFuncCount();
-    script->mHal.info.exportedVariableCount = drv->ME->getExportVarCount();
-    script->mHal.info.exportedPragmaCount = drv->ME->getPragmaCount();
-    script->mHal.info.exportedPragmaKeyList = drv->ME->getPragmaKeyList();
-    script->mHal.info.exportedPragmaValueList = drv->ME->getPragmaValueList();
+    script->mHal.info.exportedFunctionCount = info->getExportFuncNames().size();
+    script->mHal.info.exportedVariableCount = info->getExportVarNames().size();
+    script->mHal.info.exportedPragmaCount = info->getPragmas().size();
+    script->mHal.info.exportedPragmaKeyList =
+        const_cast<const char**>(exec->getPragmaKeys().array());
+    script->mHal.info.exportedPragmaValueList =
+        const_cast<const char**>(exec->getPragmaValues().array());
 
     if (drv->mRootExpand) {
-      script->mHal.info.root = drv->mRootExpand;
+        script->mHal.info.root = drv->mRootExpand;
     } else {
-      script->mHal.info.root = drv->mRoot;
+        script->mHal.info.root = drv->mRoot;
+    }
+
+    if (script->mHal.info.exportedVariableCount) {
+        drv->mBoundAllocs = new Allocation *[script->mHal.info.exportedVariableCount];
+        memset(drv->mBoundAllocs, 0, sizeof(void *) * script->mHal.info.exportedVariableCount);
     }
 
     pthread_mutex_unlock(&rsdgInitMutex);
@@ -201,36 +157,50 @@
 error:
 
     pthread_mutex_unlock(&rsdgInitMutex);
-    if (drv->ME) {
-        delete drv->ME;
-        drv->ME = NULL;
+    if (drv) {
+        delete drv->mCompilerContext;
+        delete drv->mCompilerDriver;
+        delete drv->mExecutable;
+        delete[] drv->mBoundAllocs;
+        free(drv);
     }
-    free(drv);
+    script->mHal.drv = NULL;
     return false;
 
 }
 
+bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid, Element *e) {
+    pthread_mutex_lock(&rsdgInitMutex);
+    // Creates the zero-filled driver state for intrinsic script `s`; `e` is not used here.
+    DrvScript *drv = (DrvScript *)calloc(1, sizeof(DrvScript));
+    if (drv == NULL) {
+        goto error;  // allocation failed: unlock and report failure
+    }
+    s->mHal.drv = drv;
+    drv->mIntrinsicID = iid;  // other driver entry points test this field to detect an intrinsic
+    drv->mIntrinsicData = rsdIntrinsic_Init(rsc, s, iid, &drv->mIntrinsicFuncs);
+    // NOTE(review): the pointer returned by rsdIntrinsic_Init is stored without being checked.
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return true;
+
+error:
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return false;
+}
+
 typedef struct {
+    RsForEachStubParamStruct fep;
+
     Context *rsc;
     Script *script;
     ForEachFunc_t kernel;
     uint32_t sig;
     const Allocation * ain;
     Allocation * aout;
-    const void * usr;
-    size_t usrLen;
 
     uint32_t mSliceSize;
     volatile int mSliceNum;
 
-    const uint8_t *ptrIn;
-    uint32_t eStrideIn;
-    uint8_t *ptrOut;
-    uint32_t eStrideOut;
-
-    uint32_t yStrideIn;
-    uint32_t yStrideOut;
-
     uint32_t xStart;
     uint32_t xEnd;
     uint32_t yStart;
@@ -239,20 +209,13 @@
     uint32_t zEnd;
     uint32_t arrayStart;
     uint32_t arrayEnd;
-
-    uint32_t dimX;
-    uint32_t dimY;
-    uint32_t dimZ;
-    uint32_t dimArray;
 } MTLaunchStruct;
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
 static void wc_xy(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsForEachStubParamStruct p;
-    memset(&p, 0, sizeof(p));
-    p.usr = mtls->usr;
-    p.usr_len = mtls->usrLen;
+    memcpy(&p, &mtls->fep, sizeof(p));
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
@@ -269,9 +232,9 @@
         //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
         //ALOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
         for (p.y = yStart; p.y < yEnd; p.y++) {
-            p.out = mtls->ptrOut + (mtls->yStrideOut * p.y);
-            p.in = mtls->ptrIn + (mtls->yStrideIn * p.y);
-            fn(&p, mtls->xStart, mtls->xEnd, mtls->eStrideIn, mtls->eStrideOut);
+            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y);
+            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y);
+            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
         }
     }
 }
@@ -279,9 +242,7 @@
 static void wc_x(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsForEachStubParamStruct p;
-    memset(&p, 0, sizeof(p));
-    p.usr = mtls->usr;
-    p.usr_len = mtls->usrLen;
+    memcpy(&p, &mtls->fep, sizeof(p));
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
@@ -298,9 +259,9 @@
         //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
         //ALOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
 
-        p.out = mtls->ptrOut + (mtls->eStrideOut * xStart);
-        p.in = mtls->ptrIn + (mtls->eStrideIn * xStart);
-        fn(&p, xStart, xEnd, mtls->eStrideIn, mtls->eStrideOut);
+        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
+        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
     }
 }
 
@@ -318,22 +279,30 @@
     MTLaunchStruct mtls;
     memset(&mtls, 0, sizeof(mtls));
 
+    //ALOGE("for each script %p  in %p   out %p", s, ain, aout);
+
     DrvScript *drv = (DrvScript *)s->mHal.drv;
-    mtls.kernel = drv->mForEachFunctions[slot];
-    rsAssert(mtls.kernel != NULL);
-    mtls.sig = 0x1f;  // temp fix for old apps, full table in slang_rs_export_foreach.cpp
-    if (drv->mExportForEachSignatureList) {
-        mtls.sig = drv->mExportForEachSignatureList[slot];
+
+    if (drv->mIntrinsicID) {
+        mtls.kernel = (void (*)())drv->mIntrinsicFuncs.root;
+        usr = drv->mIntrinsicData;
+    } else {
+        rsAssert(slot < drv->mExecutable->getExportForeachFuncAddrs().size());
+        mtls.kernel = reinterpret_cast<ForEachFunc_t>(
+                          drv->mExecutable->getExportForeachFuncAddrs()[slot]);
+        rsAssert(mtls.kernel != NULL);
+        mtls.sig = drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second;
     }
+
     if (ain) {
-        mtls.dimX = ain->getType()->getDimX();
-        mtls.dimY = ain->getType()->getDimY();
-        mtls.dimZ = ain->getType()->getDimZ();
+        mtls.fep.dimX = ain->getType()->getDimX();
+        mtls.fep.dimY = ain->getType()->getDimY();
+        mtls.fep.dimZ = ain->getType()->getDimZ();
         //mtls.dimArray = ain->getType()->getDimArray();
     } else if (aout) {
-        mtls.dimX = aout->getType()->getDimX();
-        mtls.dimY = aout->getType()->getDimY();
-        mtls.dimZ = aout->getType()->getDimZ();
+        mtls.fep.dimX = aout->getType()->getDimX();
+        mtls.fep.dimY = aout->getType()->getDimY();
+        mtls.fep.dimZ = aout->getType()->getDimZ();
         //mtls.dimArray = aout->getType()->getDimArray();
     } else {
         rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
@@ -341,24 +310,24 @@
     }
 
     if (!sc || (sc->xEnd == 0)) {
-        mtls.xEnd = mtls.dimX;
+        mtls.xEnd = mtls.fep.dimX;
     } else {
-        rsAssert(sc->xStart < mtls.dimX);
-        rsAssert(sc->xEnd <= mtls.dimX);
+        rsAssert(sc->xStart < mtls.fep.dimX);
+        rsAssert(sc->xEnd <= mtls.fep.dimX);
         rsAssert(sc->xStart < sc->xEnd);
-        mtls.xStart = rsMin(mtls.dimX, sc->xStart);
-        mtls.xEnd = rsMin(mtls.dimX, sc->xEnd);
+        mtls.xStart = rsMin(mtls.fep.dimX, sc->xStart);
+        mtls.xEnd = rsMin(mtls.fep.dimX, sc->xEnd);
         if (mtls.xStart >= mtls.xEnd) return;
     }
 
     if (!sc || (sc->yEnd == 0)) {
-        mtls.yEnd = mtls.dimY;
+        mtls.yEnd = mtls.fep.dimY;
     } else {
-        rsAssert(sc->yStart < mtls.dimY);
-        rsAssert(sc->yEnd <= mtls.dimY);
+        rsAssert(sc->yStart < mtls.fep.dimY);
+        rsAssert(sc->yEnd <= mtls.fep.dimY);
         rsAssert(sc->yStart < sc->yEnd);
-        mtls.yStart = rsMin(mtls.dimY, sc->yStart);
-        mtls.yEnd = rsMin(mtls.dimY, sc->yEnd);
+        mtls.yStart = rsMin(mtls.fep.dimY, sc->yStart);
+        mtls.yEnd = rsMin(mtls.fep.dimY, sc->yEnd);
         if (mtls.yStart >= mtls.yEnd) return;
     }
 
@@ -376,32 +345,45 @@
     mtls.ain = ain;
     mtls.aout = aout;
     mtls.script = s;
-    mtls.usr = usr;
-    mtls.usrLen = usrLen;
+    mtls.fep.usr = usr;
+    mtls.fep.usrLen = usrLen;
     mtls.mSliceSize = 10;
     mtls.mSliceNum = 0;
 
-    mtls.ptrIn = NULL;
-    mtls.eStrideIn = 0;
+    mtls.fep.ptrIn = NULL;
+    mtls.fep.eStrideIn = 0;
     if (ain) {
-        mtls.ptrIn = (const uint8_t *)ain->getPtr();
-        mtls.eStrideIn = ain->getType()->getElementSizeBytes();
-        mtls.yStrideIn = ain->mHal.drvState.stride;
+        DrvAllocation *aindrv = (DrvAllocation *)ain->mHal.drv;
+        mtls.fep.ptrIn = (const uint8_t *)aindrv->lod[0].mallocPtr;
+        mtls.fep.eStrideIn = ain->getType()->getElementSizeBytes();
+        mtls.fep.yStrideIn = aindrv->lod[0].stride;
     }
 
-    mtls.ptrOut = NULL;
-    mtls.eStrideOut = 0;
+    mtls.fep.ptrOut = NULL;
+    mtls.fep.eStrideOut = 0;
     if (aout) {
-        mtls.ptrOut = (uint8_t *)aout->getPtr();
-        mtls.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls.yStrideOut = aout->mHal.drvState.stride;
+        DrvAllocation *aoutdrv = (DrvAllocation *)aout->mHal.drv;
+        mtls.fep.ptrOut = (uint8_t *)aoutdrv->lod[0].mallocPtr;
+        mtls.fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls.fep.yStrideOut = aoutdrv->lod[0].stride;
     }
 
+
     if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
         dc->mInForEach = true;
-        if (mtls.dimY > 1) {
+        if (mtls.fep.dimY > 1) {
+            mtls.mSliceSize = mtls.fep.dimY / (dc->mWorkers.mCount * 4);
+            if(mtls.mSliceSize < 1) {
+                mtls.mSliceSize = 1;
+            }
+
             rsdLaunchThreads(mrsc, wc_xy, &mtls);
         } else {
+            mtls.mSliceSize = mtls.fep.dimX / (dc->mWorkers.mCount * 4);
+            if(mtls.mSliceSize < 1) {
+                mtls.mSliceSize = 1;
+            }
+
             rsdLaunchThreads(mrsc, wc_x, &mtls);
         }
         dc->mInForEach = false;
@@ -409,9 +391,7 @@
         //ALOGE("launch 1");
     } else {
         RsForEachStubParamStruct p;
-        memset(&p, 0, sizeof(p));
-        p.usr = mtls.usr;
-        p.usr_len = mtls.usrLen;
+        memcpy(&p, &mtls.fep, sizeof(p));
         uint32_t sig = mtls.sig;
 
         //ALOGE("launch 3");
@@ -419,13 +399,11 @@
         for (p.ar[0] = mtls.arrayStart; p.ar[0] < mtls.arrayEnd; p.ar[0]++) {
             for (p.z = mtls.zStart; p.z < mtls.zEnd; p.z++) {
                 for (p.y = mtls.yStart; p.y < mtls.yEnd; p.y++) {
-                    uint32_t offset = mtls.dimX * mtls.dimY * mtls.dimZ * p.ar[0] +
-                                      mtls.dimX * mtls.dimY * p.z +
-                                      mtls.dimX * p.y;
-                    p.out = mtls.ptrOut + (mtls.eStrideOut * offset);
-                    p.in = mtls.ptrIn + (mtls.eStrideIn * offset);
-                    fn(&p, mtls.xStart, mtls.xEnd, mtls.eStrideIn,
-                       mtls.eStrideOut);
+                    uint32_t offset = mtls.fep.dimY * mtls.fep.dimZ * p.ar[0] +
+                                      mtls.fep.dimY * p.z + p.y;
+                    p.out = mtls.fep.ptrOut + (mtls.fep.yStrideOut * offset);
+                    p.in = mtls.fep.ptrIn + (mtls.fep.yStrideIn * offset);
+                    fn(&p, mtls.xStart, mtls.xEnd, mtls.fep.eStrideIn, mtls.fep.eStrideOut);
                 }
             }
         }
@@ -469,8 +447,8 @@
     //ALOGE("invoke %p %p %i %p %i", dc, script, slot, params, paramLength);
 
     Script * oldTLS = setTLS(script);
-    ((void (*)(const void *, uint32_t))
-        drv->mInvokeFunctions[slot])(params, paramLength);
+    reinterpret_cast<void (*)(const void *, uint32_t)>(
+        drv->mExecutable->getExportFuncAddrs()[slot])(params, paramLength);
     setTLS(oldTLS);
 }
 
@@ -480,7 +458,13 @@
     //rsAssert(!script->mFieldIsObject[slot]);
     //ALOGE("setGlobalVar %p %p %i %p %i", dc, script, slot, data, dataLength);
 
-    int32_t *destPtr = ((int32_t **)drv->mFieldAddress)[slot];
+    if (drv->mIntrinsicID) {
+        drv->mIntrinsicFuncs.setVar(dc, script, drv->mIntrinsicData, slot, data, dataLength);
+        return;
+    }
+
+    int32_t *destPtr = reinterpret_cast<int32_t *>(
+                          drv->mExecutable->getExportVarAddrs()[slot]);
     if (!destPtr) {
         //ALOGV("Calling setVar on slot = %i which is null", slot);
         return;
@@ -497,7 +481,8 @@
         const size_t *dims, size_t dimLength) {
     DrvScript *drv = (DrvScript *)script->mHal.drv;
 
-    int32_t *destPtr = ((int32_t **)drv->mFieldAddress)[slot];
+    int32_t *destPtr = reinterpret_cast<int32_t *>(
+        drv->mExecutable->getExportVarAddrs()[slot]);
     if (!destPtr) {
         //ALOGV("Calling setVar on slot = %i which is null", slot);
         return;
@@ -529,18 +514,31 @@
     memcpy(destPtr, data, dataLength);
 }
 
-void rsdScriptSetGlobalBind(const Context *dc, const Script *script, uint32_t slot, void *data) {
+void rsdScriptSetGlobalBind(const Context *dc, const Script *script, uint32_t slot, Allocation *data) {
     DrvScript *drv = (DrvScript *)script->mHal.drv;
+    // Binds allocation `data` (may be NULL) to the exported pointer variable at `slot`.
     //rsAssert(!script->mFieldIsObject[slot]);
     //ALOGE("setGlobalBind %p %p %i %p", dc, script, slot, data);
 
-    int32_t *destPtr = ((int32_t **)drv->mFieldAddress)[slot];
+    if (drv->mIntrinsicID) {
+        drv->mIntrinsicFuncs.bind(dc, script, drv->mIntrinsicData, slot, data);
+        return;
+    }
+    // Compiled scripts: look up the address of the exported variable itself.
+    int32_t *destPtr = reinterpret_cast<int32_t *>(
+                          drv->mExecutable->getExportVarAddrs()[slot]);
     if (!destPtr) {
         //ALOGV("Calling setVar on slot = %i which is null", slot);
         return;
     }
 
-    memcpy(destPtr, &data, sizeof(void *));
+    void *ptr = NULL;
+    drv->mBoundAllocs[slot] = data;  // remembered so a raw pointer can be mapped back to its allocation
+    if(data) {
+        DrvAllocation *allocDrv = (DrvAllocation *)data->mHal.drv;
+        ptr = allocDrv->lod[0].mallocPtr;  // the variable receives the lod[0] data pointer, not the object
+    }
+    memcpy(destPtr, &ptr, sizeof(void *));
 }
 
 void rsdScriptSetGlobalObj(const Context *dc, const Script *script, uint32_t slot, ObjectBase *data) {
@@ -548,7 +546,8 @@
     //rsAssert(script->mFieldIsObject[slot]);
     //ALOGE("setGlobalObj %p %p %i %p", dc, script, slot, data);
 
-    int32_t *destPtr = ((int32_t **)drv->mFieldAddress)[slot];
+    int32_t *destPtr = reinterpret_cast<int32_t *>(
+                          drv->mExecutable->getExportVarAddrs()[slot]);
     if (!destPtr) {
         //ALOGV("Calling setVar on slot = %i which is null", slot);
         return;
@@ -560,38 +559,62 @@
 void rsdScriptDestroy(const Context *dc, Script *script) {
     DrvScript *drv = (DrvScript *)script->mHal.drv;
 
-    if (drv->mFieldAddress) {
-        size_t exportVarCount = drv->ME->getExportVarCount();
-        for (size_t ct = 0; ct < exportVarCount; ct++) {
-            if (drv->mFieldIsObject[ct]) {
-                // The field address can be NULL if the script-side has
-                // optimized the corresponding global variable away.
-                if (drv->mFieldAddress[ct]) {
-                    rsrClearObject(dc, script, (ObjectBase **)drv->mFieldAddress[ct]);
+    if (drv == NULL) {
+        return;
+    }
+    // Walk exported variable addresses and object-slot flags in lockstep, clearing object globals.
+    if (drv->mExecutable) {
+        Vector<void *>::const_iterator var_addr_iter =
+            drv->mExecutable->getExportVarAddrs().begin();
+        Vector<void *>::const_iterator var_addr_end =
+            drv->mExecutable->getExportVarAddrs().end();
+        bcc::RSInfo::ObjectSlotListTy::const_iterator is_object_iter =
+            drv->mExecutable->getInfo().getObjectSlots().begin();
+        bcc::RSInfo::ObjectSlotListTy::const_iterator is_object_end =
+            drv->mExecutable->getInfo().getObjectSlots().end();
+        while ((var_addr_iter != var_addr_end) &&
+               (is_object_iter != is_object_end)) {
+            // The field address can be NULL if the script-side has optimized
+            // the corresponding global variable away.
+            ObjectBase **obj_addr =
+                reinterpret_cast<ObjectBase **>(*var_addr_iter);
+            if (*is_object_iter) {
+                if (*var_addr_iter != NULL) {
+                    rsrClearObject(dc, script, obj_addr);
                 }
             }
+            var_addr_iter++;
+            is_object_iter++;
         }
-        free(drv->mFieldAddress);
-        drv->mFieldAddress = NULL;
-        free(drv->mFieldIsObject);
-        drv->mFieldIsObject = NULL;    }
-
-    if (drv->mInvokeFunctions) {
-        free(drv->mInvokeFunctions);
-        drv->mInvokeFunctions = NULL;
     }
 
-    if (drv->mForEachFunctions) {
-        free(drv->mForEachFunctions);
-        drv->mForEachFunctions = NULL;
-    }
-
-    delete drv->ME;
-    drv->ME = NULL;
-
+    // Intrinsic scripts own private data created at init; release it through the installed hook.
+    if (drv->mIntrinsicID) drv->mIntrinsicFuncs.destroy(dc, script, drv->mIntrinsicData);
+    delete drv->mCompilerContext;
+    delete drv->mCompilerDriver;
+    delete drv->mExecutable;
+    delete[] drv->mBoundAllocs;
     free(drv);
     script->mHal.drv = NULL;
-
 }
 
+Allocation * rsdScriptGetAllocationForPointer(const android::renderscript::Context *dc,
+                                              const android::renderscript::Script *sc,
+                                              const void *ptr) {
+    // Maps a raw data pointer back to the allocation bound to one of sc's variables, or NULL.
+    DrvScript *drv = (DrvScript *)sc->mHal.drv;
+    if (!ptr) return NULL;
+    // mBoundAllocs is not allocated for intrinsics even though they set a variable count.
+    const size_t count = drv->mBoundAllocs ? sc->mHal.info.exportedVariableCount : 0;
+    for (uint32_t ct=0; ct < count; ct++) {
+        Allocation *a = drv->mBoundAllocs[ct];
+        if (!a) continue;
+        DrvAllocation *adrv = (DrvAllocation *)a->mHal.drv;
+        if (adrv->lod[0].mallocPtr == ptr) {
+            return a;
+        }
+    }
+    ALOGE("rsGetAllocation, failed to find %p", ptr);
+    return NULL;
+}
 
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index 7a4b138..114e6cf 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -24,6 +24,11 @@
 bool rsdScriptInit(const android::renderscript::Context *, android::renderscript::ScriptC *,
                    char const *resName, char const *cacheDir,
                    uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags);
+bool rsdInitIntrinsic(const android::renderscript::Context *rsc,
+                      android::renderscript::Script *s,
+                      RsScriptIntrinsicID iid,
+                      android::renderscript::Element *e);
+
 void rsdScriptInvokeFunction(const android::renderscript::Context *dc,
                              android::renderscript::Script *script,
                              uint32_t slot,
@@ -58,7 +63,7 @@
                                        size_t dimLength);
 void rsdScriptSetGlobalBind(const android::renderscript::Context *,
                             const android::renderscript::Script *,
-                            uint32_t slot, void *data);
+                            uint32_t slot, android::renderscript::Allocation *data);
 void rsdScriptSetGlobalObj(const android::renderscript::Context *,
                            const android::renderscript::Script *,
                            uint32_t slot, android::renderscript::ObjectBase *data);
@@ -76,5 +81,9 @@
 void rsdScriptDestroy(const android::renderscript::Context *dc,
                       android::renderscript::Script *script);
 
+android::renderscript::Allocation * rsdScriptGetAllocationForPointer(
+                        const android::renderscript::Context *dc,
+                        const android::renderscript::Script *script,
+                        const void *);
 
 #endif
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index f325087..d580a3d 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -36,7 +36,6 @@
 #include <cutils/properties.h>
 #include <sys/syscall.h>
 #include <string.h>
-#include <bcc/bcc.h>
 
 using namespace android;
 using namespace android::renderscript;
@@ -55,6 +54,7 @@
     SetPriority,
     {
         rsdScriptInit,
+        rsdInitIntrinsic,
         rsdScriptInvokeFunction,
         rsdScriptInvokeRoot,
         rsdScriptInvokeForEach,
@@ -80,11 +80,17 @@
         rsdAllocationData1D,
         rsdAllocationData2D,
         rsdAllocationData3D,
+        rsdAllocationRead1D,
+        rsdAllocationRead2D,
+        rsdAllocationRead3D,
+        rsdAllocationLock1D,
+        rsdAllocationUnlock1D,
         rsdAllocationData1D_alloc,
         rsdAllocationData2D_alloc,
         rsdAllocationData3D_alloc,
         rsdAllocationElementData1D,
-        rsdAllocationElementData2D
+        rsdAllocationElementData2D,
+        rsdAllocationGenerateMipmaps
     },
 
 
@@ -228,7 +234,7 @@
 
 
     int cpu = sysconf(_SC_NPROCESSORS_ONLN);
-    if(rsc->props.mDebugMaxThreads && (cpu > (int)rsc->props.mDebugMaxThreads)) {
+    if(rsc->props.mDebugMaxThreads) {
         cpu = rsc->props.mDebugMaxThreads;
     }
     if (cpu < 2) {
diff --git a/driver/rsdIntrinsicColorMatrix.cpp b/driver/rsdIntrinsicColorMatrix.cpp
new file mode 100644
index 0000000..bbeb1ef
--- /dev/null
+++ b/driver/rsdIntrinsicColorMatrix.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsdCore.h"
+#include "rsdIntrinsics.h"
+#include "rsdAllocation.h"
+
+#include "rsdIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+struct ConvolveParams {
+    float fp[16];  // 4x4 matrix coefficients as floats, read by the scalar path (One)
+    short ip[16];  // the same coefficients scaled by 255 (+0.5, truncated), passed to the NEON kernel
+};
+
+static void ColorMatrix_SetVar(const Context *dc, const Script *script, void * intrinsicData,
+                               uint32_t slot, void *data, size_t dataLength) {
+    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
+    // Stores a new coefficient matrix from `data`; never copies more than fp can hold.
+    rsAssert(slot == 0);
+    memcpy (cp->fp, data, dataLength < sizeof(cp->fp) ? dataLength : sizeof(cp->fp));
+    for(int ct=0; ct < 16; ct++) {
+        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);  // fixed-point copy used by the NEON kernel
+    }
+}
+
+extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, const short *coef, uint32_t count);
+extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, const short *coef, uint32_t count);
+
+static void One(const RsForEachStubParamStruct *p, uchar4 *out,
+                const uchar4 *py, const float* coeff) {
+    float4 i = convert_float4(py[0]);
+    // Scalar path for one pixel: each output channel is a weighted sum of the four input channels.
+    float4 sum;
+    sum.x = i.x * coeff[0] +
+            i.y * coeff[4] +
+            i.z * coeff[8] +
+            i.w * coeff[12];
+    sum.y = i.x * coeff[1] +
+            i.y * coeff[5] +
+            i.z * coeff[9] +
+            i.w * coeff[13];
+    sum.z = i.x * coeff[2] +
+            i.y * coeff[6] +
+            i.z * coeff[10] +
+            i.w * coeff[14];
+    sum.w = i.x * coeff[3] +
+            i.y * coeff[7] +
+            i.z * coeff[11] +
+            i.w * coeff[15];
+    // Clamp every channel to [0, 255] before narrowing back to bytes.
+    sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
+    sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
+    sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
+    sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
+    // `p` is not used by this helper.
+    *out = convert_uchar4(sum);
+}
+
+static void ColorMatrix_uchar4(const RsForEachStubParamStruct *p,
+                                    uint32_t xstart, uint32_t xend,
+                                    uint32_t instep, uint32_t outstep) {
+    ConvolveParams *cp = (ConvolveParams *)p->usr;
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // Root kernel for the span [xstart, xend) of one row; instep/outstep are not used.
+    in += xstart;
+    out += xstart;
+    // NOTE(review): assumes p->in / p->out point at x == 0 of the row — confirm against the launcher.
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1) >> 2;
+        if(len > 0) {
+            rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+            x1 += len << 2;
+            out += len << 2;
+            in += len << 2;
+        }
+#endif
+        // Scalar path for the pixels the NEON call did not cover (all of them without NEON).
+        while(x1 != x2) {
+            One(p, out++, in++, cp->fp);
+            x1++;
+        }
+    }
+}
+
+void * rsdIntrinsic_InitColorMatrix(const android::renderscript::Context *dc,
+                                    android::renderscript::Script *script,
+                                    RsdIntriniscFuncs_t *funcs) {
+    // Installs the color-matrix hooks and returns heap state initialised to the identity matrix.
+    script->mHal.info.exportedVariableCount = 1;
+    funcs->setVar = ColorMatrix_SetVar;
+    funcs->root = ColorMatrix_uchar4;
+    // NOTE(review): the calloc result is written through without a NULL check.
+    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
+    cp->fp[0] = 1.f;
+    cp->fp[5] = 1.f;
+    cp->fp[10] = 1.f;
+    cp->fp[15] = 1.f;
+    for(int ct=0; ct < 16; ct++) {
+        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
+    }
+    return cp;
+}
+
+
diff --git a/driver/rsdIntrinsicConvolve3x3.cpp b/driver/rsdIntrinsicConvolve3x3.cpp
new file mode 100644
index 0000000..2aa7849
--- /dev/null
+++ b/driver/rsdIntrinsicConvolve3x3.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsdCore.h"
+#include "rsdIntrinsics.h"
+#include "rsdAllocation.h"
+
+#include "rsdIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+struct ConvolveParams {
+    float fp[16];  // kernel coefficients as floats; ConvolveOne reads indices 0..8
+    short ip[16];  // coefficients scaled by 255 (+0.5, truncated), passed to the NEON kernel
+    ObjectBaseRef<Allocation> alloc;  // input image, set by Convolve3x3_Bind
+};
+
+static void Convolve3x3_Bind(const Context *dc, const Script *script,
+                             void * intrinsicData, uint32_t slot, Allocation *data) {
+    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
+    rsAssert(slot == 1);  // slot 1 is the input allocation; slot 0 is the coefficient variable
+    cp->alloc.set(data);  // kept as the image the root kernel reads from
+}
+
+static void Convolve3x3_SetVar(const Context *dc, const Script *script, void * intrinsicData,
+                               uint32_t slot, void *data, size_t dataLength) {
+    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
+    // Stores new kernel coefficients from `data`; never copies more than fp can hold.
+    rsAssert(slot == 0);
+    memcpy (cp->fp, data, dataLength < sizeof(cp->fp) ? dataLength : sizeof(cp->fp));
+    for(int ct=0; ct < 9; ct++) {
+        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);  // fixed-point copy used by the NEON kernel
+    }
+}
+
+extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1, const void *y2, const short *coef, uint32_t count);
+
+
+static void ConvolveOne(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+                        const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
+                        const float* coeff) {
+    // Neighbour columns clamped to the row; the last valid index is dimX - 1, not dimX.
+    uint32_t x1 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
+    uint32_t x2 = rsMax((int32_t)x-1, 0);
+
+    float4 px = convert_float4(py0[x1]) * coeff[0] +
+                convert_float4(py0[x]) * coeff[1] +
+                convert_float4(py0[x2]) * coeff[2] +
+                convert_float4(py1[x1]) * coeff[3] +
+                convert_float4(py1[x]) * coeff[4] +
+                convert_float4(py1[x2]) * coeff[5] +
+                convert_float4(py2[x1]) * coeff[6] +
+                convert_float4(py2[x]) * coeff[7] +
+                convert_float4(py2[x2]) * coeff[8];
+
+    // Clamp every channel to [0, 255] before narrowing back to bytes.
+    px.x = px.x < 0 ? 0 : (px.x > 255 ? 255 : px.x);
+    px.y = px.y < 0 ? 0 : (px.y > 255 ? 255 : px.y);
+    px.z = px.z < 0 ? 0 : (px.z > 255 ? 255 : px.z);
+    px.w = px.w < 0 ? 0 : (px.w > 255 ? 255 : px.w);
+
+    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
+    *out = o;
+}
+
+static void Convolve3x3_uchar4(const RsForEachStubParamStruct *p,
+                                    uint32_t xstart, uint32_t xend,
+                                    uint32_t instep, uint32_t outstep) {
+    ConvolveParams *cp = (ConvolveParams *)p->usr;
+    DrvAllocation *din = (DrvAllocation *)cp->alloc->mHal.drv;  // NOTE(review): cp->alloc is not checked; assumes Bind ran
+    const uchar *pin = (const uchar *)din->lod[0].mallocPtr;
+    // Input rows come from the bound allocation; rows above/below p->y are clamped to [0, dimY-1].
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const uchar4 *py0 = (const uchar4 *)(pin + din->lod[0].stride * y2);
+    const uchar4 *py1 = (const uchar4 *)(pin + din->lod[0].stride * p->y);
+    const uchar4 *py2 = (const uchar4 *)(pin + din->lod[0].stride * y1);
+    // NOTE(review): out is not advanced by xstart here — confirm what p->out points at for xstart > 0.
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if(x1 == 0) {
+        ConvolveOne(p, 0, out, py0, py1, py2, cp->fp);  // column 0 always takes the scalar path
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1], &py1[x1], &py2[x1], cp->ip, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif
+        // Scalar path for the pixels the NEON call did not cover (all of them without NEON).
+        while(x1 != x2) {
+            ConvolveOne(p, x1, out, py0, py1, py2, cp->fp);
+            out++;
+            x1++;
+        }
+    }
+}
+
+void * rsdIntrinsic_InitConvolve3x3(const android::renderscript::Context *dc,
+                                    android::renderscript::Script *script,
+                                    RsdIntriniscFuncs_t *funcs) {
+    // Installs the 3x3 convolve hooks and returns heap state holding nine 1/9 coefficients.
+    script->mHal.info.exportedVariableCount = 2;
+    funcs->bind = Convolve3x3_Bind;
+    funcs->setVar = Convolve3x3_SetVar;
+    funcs->root = Convolve3x3_uchar4;
+    // NOTE(review): calloc result is unchecked; the ObjectBaseRef member is zero-filled, not constructed.
+    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
+    for(int ct=0; ct < 9; ct++) {
+        cp->fp[ct] = 1.f / 9.f;
+        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
+    }
+    return cp;
+}
+
+
diff --git a/driver/rsdIntrinsicInlines.h b/driver/rsdIntrinsicInlines.h
new file mode 100644
index 0000000..10dcb1c
--- /dev/null
+++ b/driver/rsdIntrinsicInlines.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+// Unsigned scalar aliases, then 2/3/4-lane vector types declared with ext_vector_type.
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint32_t uint;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef long long3 __attribute__((ext_vector_type(3)));
+typedef long long4 __attribute__((ext_vector_type(4)));
+
+enum IntrinsicEnums {
+    INTRINSIC_UNDEFINED,
+    INTRINSIC_CONVOLVE_3x3,
+    INTRINXIC_COLORMATRIX  // (sic) rsdIntrinsics.cpp declares the same enumerators with this spelling
+
+};
+
+static inline int4 convert_int4(uchar4 i) {  // widens each of the four byte lanes to int
+    int4 f4 = {i.x, i.y, i.z, i.w};
+    return f4;
+}
+
+static inline short4 convert_short4(uchar4 i) {  // widens each of the four byte lanes to short
+    short4 f4 = {i.x, i.y, i.z, i.w};
+    return f4;
+}
+
+static inline float4 convert_float4(uchar4 i) {  // converts each of the four byte lanes to float
+    float4 f4 = {i.x, i.y, i.z, i.w};
+    return f4;
+}
+
+static inline uchar4 convert_uchar4(int4 i) {  // narrows each int lane with a plain (uchar) cast; no clamping
+    uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
+    return f4;
+}
+
+static inline uchar4 convert_uchar4(float4 i) {  // narrows each float lane with a plain (uchar) cast; no clamping
+    uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
+    return f4;
+}
+
+
+static inline int4 clamp(int4 amount, int low, int high) {  // per-lane clamp of amount to [low, high]
+    int4 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+    return r;
+}
+
diff --git a/driver/rsdIntrinsics.cpp b/driver/rsdIntrinsics.cpp
new file mode 100644
index 0000000..f53d08b
--- /dev/null
+++ b/driver/rsdIntrinsics.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsdCore.h"
+#include "rsdIntrinsics.h"
+#include "rsdAllocation.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+enum IntrinsicEnums {
+    INTRINSIC_UNDEFINED,
+    INTRINSIC_CONVOLVE_3x3,    // rsdIntrinsic_Init() routes this to rsdIntrinsic_InitConvolve3x3()
+    INTRINXIC_COLORMATRIX      // routed to rsdIntrinsic_InitColorMatrix(); "INTRINXIC" is spelled as-is (sic)
+    // These values are compared against the RsScriptIntrinsicID passed to rsdIntrinsic_Init().
+};
+
+
+void * rsdIntrinsic_InitConvolve3x3(const Context *, Script *, RsdIntriniscFuncs_t *);
+void * rsdIntrinsic_InitColorMatrix(const Context *, Script *, RsdIntriniscFuncs_t *);
+
+
+static void Bind(const Context *, const Script *, void *, uint32_t, Allocation *) {
+    rsAssert(!"Intrinsic_Bind unexpectedly called");  // default bind hook set by rsdIntrinsic_Init(); the asserted expression is always false
+}
+
+static void SetVar(const Context *, const Script *, void *, uint32_t, void *, size_t) {
+    rsAssert(!"Intrinsic_SetVar unexpectedly called");  // default setVar hook set by rsdIntrinsic_Init(); the asserted expression is always false
+}
+
+static void Destroy(const Context *dc, const Script *script, void * intrinsicData) {
+    free(intrinsicData);  // default destroy hook; assumes intrinsicData came from malloc() - TODO confirm in the init routines
+}
+
+void * rsdIntrinsic_Init(const android::renderscript::Context *dc,
+                         android::renderscript::Script *script,
+                         RsScriptIntrinsicID iid,
+                         RsdIntriniscFuncs_t *funcs) {
+    // Install the shared default hooks, then hand funcs to the init routine selected by iid.
+    funcs->bind = Bind;
+    funcs->setVar = SetVar;
+    funcs->destroy = Destroy;
+    void *intrinsicData = NULL;  // stays NULL when iid matches no known intrinsic
+    switch(iid) {
+    case INTRINSIC_CONVOLVE_3x3:
+        intrinsicData = rsdIntrinsic_InitConvolve3x3(dc, script, funcs);
+        break;
+    case INTRINXIC_COLORMATRIX:
+        intrinsicData = rsdIntrinsic_InitColorMatrix(dc, script, funcs);
+        break;
+    default: break;
+    }
+    return intrinsicData;
+}
+
+
+
diff --git a/driver/rsdIntrinsics.h b/driver/rsdIntrinsics.h
new file mode 100644
index 0000000..4a1a4a2
--- /dev/null
+++ b/driver/rsdIntrinsics.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_INTRINSICS_H
+#define RSD_INTRINSICS_H
+
+#include <rs_hal.h>
+
+typedef struct RsdIntriniscFuncs_rec {
+    // Table of per-intrinsic hooks; rsdIntrinsic_Init() is handed a pointer to one of these.
+    void (*bind)(const android::renderscript::Context *dc,
+                 const android::renderscript::Script *script,
+                 void * intrinsicData,
+                 uint32_t slot, android::renderscript::Allocation *data);
+    void (*setVar)(const android::renderscript::Context *dc,
+                   const android::renderscript::Script *script,
+                   void * intrinsicData,
+                   uint32_t slot, void *data, size_t dataLength);
+    void (*root)(const android::renderscript::RsForEachStubParamStruct *,
+                 uint32_t x1, uint32_t x2, uint32_t instep, uint32_t outstep);
+    // NOTE(review): root is the only hook without an intrinsicData argument; presumably the kernel entry point - confirm.
+    void (*destroy)(const android::renderscript::Context *dc,
+                    const android::renderscript::Script *script,
+                    void * intrinsicData);
+} RsdIntriniscFuncs_t;
+
+void * rsdIntrinsic_Init(const android::renderscript::Context *dc,
+                         android::renderscript::Script *script,
+                         RsScriptIntrinsicID id, RsdIntriniscFuncs_t *funcs);
+
+
+
+#endif // RSD_INTRINSICS_H
+
diff --git a/driver/rsdIntrinsics_Convolve.S b/driver/rsdIntrinsics_Convolve.S
new file mode 100644
index 0000000..a08658d
--- /dev/null
+++ b/driver/rsdIntrinsics_Convolve.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+/*
+        r0 = dst
+        r1 = y0 base pointer
+        r2 = y1 base pointer
+        r3 = y2 base pointer
+        [sp]     = coeffs (pointer to 16-bit coefficients)
+        [sp, #4] = length / 2 (loop iterations; each stores 8 output bytes)
+*/
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+
+        ldr r4, [sp, #8+64]             /* coeffs: skip the 8 bytes of r4/lr and 64 bytes of q4-q7 just pushed */
+        vld1.16 {q0}, [r4]!
+        vld1.16 {q1}, [r4]
+        ldr r4, [sp, #12+64]            /* r4 = loop count */
+
+1:
+        vld1.8 {q13}, [r1]              /* 16 source bytes are read per row ... */
+        vld1.8 {q14}, [r2]
+        vld1.8 {q15}, [r3]
+        add r1, r1, #8                  /* ... but each row pointer only advances by 8 */
+        add r2, r2, #8
+        add r3, r3, #8
+        PLD         (r1, #8)
+        PLD         (r2, #8)
+        PLD         (r3, #8)
+
+        vmovl.u8 q2, d26                /* widen the bytes to 16 bits */
+        vmovl.u8 q3, d27
+        vmovl.u8 q4, d28
+        vmovl.u8 q5, d29
+        vmovl.u8 q6, d30
+        vmovl.u8 q7, d31
+
+/*
+        The two pixel source array is
+        d4,  d5,  d6,  d7
+        d8,  d9,  d10, d11
+        d12, d13, d14, d15
+*/
+
+        vmull.s16 q8, d4, d0[0]         /* q8 accumulates columns 0-2, q9 columns 1-3 of the table above */
+        vmull.s16 q9, d5, d0[0]
+
+        vmlal.s16 q8, d5, d0[1]
+        vmlal.s16 q9, d6, d0[1]
+
+        vmlal.s16 q8, d6, d0[2]
+        vmlal.s16 q9, d7, d0[2]
+
+        vmlal.s16 q8, d8, d0[3]         /* second row starts at d8 for q8, matching d9 for q9 */
+        vmlal.s16 q9, d9, d0[3]
+
+        vmlal.s16 q8, d9, d1[0]
+        vmlal.s16 q9, d10, d1[0]
+
+        vmlal.s16 q8, d10, d1[1]
+        vmlal.s16 q9, d11, d1[1]
+
+        vmlal.s16 q8, d12, d1[2]
+        vmlal.s16 q9, d13, d1[2]
+
+        vmlal.s16 q8, d13, d1[3]
+        vmlal.s16 q9, d14, d1[3]
+
+        vmlal.s16 q8, d14, d2[0]
+        vmlal.s16 q9, d15, d2[0]
+
+        vshrn.i32 d16, q8, #8           /* drop 8 low bits of each 32-bit sum, narrow to 16 bits */
+        vshrn.i32 d17, q9, #8
+
+        vqmovun.s16 d16, q8             /* saturate to unsigned bytes */
+        vst1.8 d16, [r0]!
+
+        subs r4, r4, #1
+        bne 1b
+
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/*
+        r0 = dst
+        r1 = src
+        r2 = matrix
+        r3 = length (loop iterations; each handles four 4-byte elements)
+*/
+ENTRY(rsdIntrinsicColorMatrix4x4_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}         /* saved and restored although q4-q7 and r4 are not modified below */
+
+        vld1.16 {q2}, [r2]!             /* 16 16-bit matrix values land in d4-d7 */
+        vld1.16 {q3}, [r2]!
+
+1:
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!   /* de-interleave: byte 0 of each element -> d0, byte 1 -> d1, ... */
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+
+        vmovl.u8 q12, d0                /* widen each channel to 16 bits */
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        vmull.s16 q8,  d24, d4[0]       /* NOTE(review): every accumulator only ever multiplies its own channel */
+        vmull.s16 q9,  d26, d4[1]       /* (q8<-d24, q9<-d26, q10<-d28, q11<-d30), so there are no cross-channel */
+        vmull.s16 q10, d28, d4[2]       /* terms - confirm this is the intended matrix arithmetic */
+        vmull.s16 q11, d30, d4[3]
+
+        vmlal.s16 q8,  d24, d5[0]
+        vmlal.s16 q9,  d26, d5[1]
+        vmlal.s16 q10, d28, d5[2]
+        vmlal.s16 q11, d30, d5[3]
+
+        vmlal.s16 q8,  d24, d6[0]
+        vmlal.s16 q9,  d26, d6[1]
+        vmlal.s16 q10, d28, d6[2]
+        vmlal.s16 q11, d30, d6[3]
+
+        vmlal.s16 q8,  d24, d7[0]
+        vmlal.s16 q9,  d26, d7[1]
+        vmlal.s16 q10, d28, d7[2]
+        vmlal.s16 q11, d30, d7[3]
+
+        vshrn.i32 d24, q8, #8           /* drop 8 low bits of each 32-bit sum, narrow to 16 bits */
+        vshrn.i32 d26, q9, #8
+        vshrn.i32 d28, q10, #8
+        vshrn.i32 d30, q11, #8
+
+        vqmovun.s16 d0, q12             /* saturate to unsigned bytes */
+        vqmovun.s16 d1, q13
+        vqmovun.s16 d2, q14
+        vqmovun.s16 d3, q15
+
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!   /* re-interleave the four channels on store */
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+        subs r3, r3, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicColorMatrix4x4_K)
+
+/*
+        r0 = dst
+        r1 = src
+        r2 = matrix
+        r3 = length (loop iterations; each handles four 4-byte elements)
+*/
+ENTRY(rsdIntrinsicColorMatrix3x3K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}         /* saved and restored although q4-q7 and r4 are not modified below */
+
+        vld1.16 {q2}, [r2]!             /* 16 16-bit matrix values land in d4-d7; only d4-d6 lanes 0-2 are used */
+        vld1.16 {q3}, [r2]!
+
+1:
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!   /* de-interleave: byte 0 of each element -> d0, byte 1 -> d1, ... */
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+
+        vmovl.u8 q12, d0                /* widen the first three channels; d3 is left as loaded */
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+
+        vmull.s16 q8,  d24, d4[0]       /* NOTE(review): every accumulator only ever multiplies its own channel */
+        vmull.s16 q9,  d26, d4[1]       /* (q8<-d24, q9<-d26, q10<-d28), so there are no cross-channel terms - */
+        vmull.s16 q10, d28, d4[2]       /* confirm this is the intended matrix arithmetic */
+
+        vmlal.s16 q8,  d24, d5[0]
+        vmlal.s16 q9,  d26, d5[1]
+        vmlal.s16 q10, d28, d5[2]
+
+        vmlal.s16 q8,  d24, d6[0]
+        vmlal.s16 q9,  d26, d6[1]
+        vmlal.s16 q10, d28, d6[2]
+
+        vshrn.i32 d24, q8, #8           /* drop 8 low bits of each 32-bit sum, narrow to 16 bits */
+        vshrn.i32 d26, q9, #8
+        vshrn.i32 d28, q10, #8
+
+        vqmovun.s16 d0, q12             /* saturate to unsigned bytes */
+        vqmovun.s16 d1, q13
+        vqmovun.s16 d2, q14
+
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!   /* d3 is stored back exactly as it was loaded */
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+        subs r3, r3, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicColorMatrix3x3K)
+
diff --git a/driver/rsdMeshObj.cpp b/driver/rsdMeshObj.cpp
index 5dab84b..92e02be 100644
--- a/driver/rsdMeshObj.cpp
+++ b/driver/rsdMeshObj.cpp
@@ -151,7 +151,7 @@
             mAttribs[ct].ptr = NULL;
         } else {
             mAttribs[ct].buffer = 0;
-            mAttribs[ct].ptr = (const uint8_t*)alloc->mHal.drvState.mallocPtr;
+            mAttribs[ct].ptr = (const uint8_t*)alloc->mHal.drvState.mallocPtrLOD0;
         }
     }
 
@@ -172,7 +172,7 @@
         } else {
             RSD_CALL_GL(glBindBuffer, GL_ELEMENT_ARRAY_BUFFER, 0);
             RSD_CALL_GL(glDrawElements, mGLPrimitives[primIndex], len, GL_UNSIGNED_SHORT,
-                        idxAlloc->mHal.drvState.mallocPtr);
+                        idxAlloc->mHal.drvState.mallocPtrLOD0);
         }
     } else {
         RSD_CALL_GL(glDrawArrays, mGLPrimitives[primIndex], start, len);
diff --git a/driver/rsdPath.cpp b/driver/rsdPath.cpp
index e04bc02..79ec487 100644
--- a/driver/rsdPath.cpp
+++ b/driver/rsdPath.cpp
@@ -112,7 +112,7 @@
 DrvPathStatic::DrvPathStatic(const Allocation *vtx, const Allocation *loops) {
     mSegmentCount = vtx->getType()->getDimX() / 3;
     mSegments = new segment_t[mSegmentCount];
-
+/*
     const float *fin = (const float *)vtx->getPtr();
     for (uint32_t ct=0; ct < mSegmentCount; ct++) {
         segment_t *s = &mSegments[ct];
@@ -126,6 +126,7 @@
         s->y2 = fin[5];
         fin += 6;
     }
+    */
 }
 
 DrvPathStatic::~DrvPathStatic() {
diff --git a/driver/rsdRuntime.h b/driver/rsdRuntime.h
index 840eced..dc84032 100644
--- a/driver/rsdRuntime.h
+++ b/driver/rsdRuntime.h
@@ -18,7 +18,6 @@
 #define RSD_RUNTIME_STUBS_H
 
 #include <rs_hal.h>
-#include <bcc/bcc.h>
 
 #include "rsMutex.h"
 
diff --git a/driver/rsdRuntimeMath.cpp b/driver/rsdRuntimeMath.cpp
index 0a233f6..48a1e85 100644
--- a/driver/rsdRuntimeMath.cpp
+++ b/driver/rsdRuntimeMath.cpp
@@ -44,10 +44,6 @@
     return log10(v) / log10(2.f);
 }
 
-static float SC_mad(float v1, float v2, float v3) {
-    return v1 * v2 + v3;
-}
-
 #if 0
 static float SC_pown(float v, int p) {
     return powf(v, (float)p);
@@ -109,10 +105,6 @@
     return amount < low ? low : (amount > high ? high : amount);
 }
 
-static float SC_degrees(float radians) {
-    return radians * (180.f / M_PI);
-}
-
 static float SC_max_f32(float v, float v2) {
     return rsMax(v, v2);
 }
@@ -121,15 +113,6 @@
     return rsMin(v, v2);
 }
 
-static float SC_mix_f32(float start, float stop, float amount) {
-    //ALOGE("lerpf %f  %f  %f", start, stop, amount);
-    return start + (stop - start) * amount;
-}
-
-static float SC_radians(float degrees) {
-    return degrees * (M_PI / 180.f);
-}
-
 static float SC_step_f32(float edge, float v) {
     if (v < edge) return 0.f;
     return 1.f;
@@ -430,7 +413,6 @@
     { "_Z5log10f", (void *)&log10f, true },
     { "_Z5log1pf", (void *)&log1pf, true },
     { "_Z4logbf", (void *)&logbf, true },
-    { "_Z3madfff", (void *)&SC_mad, true },
     { "_Z4modffPf", (void *)&modff, true },
     //{ "_Z3nanj", (void *)&SC_nan, true },
     { "_Z9nextafterff", (void *)&nextafterf, true },
@@ -473,11 +455,8 @@
     { "_Z3mincc", (void *)&SC_min_i8, true },
 
     { "_Z5clampfff", (void *)&SC_clamp_f32, true },
-    { "_Z7degreesf", (void *)&SC_degrees, true },
     { "_Z3maxff", (void *)&SC_max_f32, true },
     { "_Z3minff", (void *)&SC_min_f32, true },
-    { "_Z3mixfff", (void *)&SC_mix_f32, true },
-    { "_Z7radiansf", (void *)&SC_radians, true },
     { "_Z4stepff", (void *)&SC_step_f32, true },
     //{ "smoothstep", (void *)&, true },
     { "_Z4signf", (void *)&SC_sign_f32, true },
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index a24bba8..da92839 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -23,6 +23,7 @@
 
 #include "utils/Timers.h"
 #include "rsdCore.h"
+#include "rsdBcc.h"
 
 #include "rsdRuntime.h"
 #include "rsdPath.h"
@@ -38,6 +39,33 @@
     Context * rsc = tls->mContext; \
     ScriptC * sc = (ScriptC *) tls->mScript
 
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+typedef unsigned char uchar3 __attribute__((ext_vector_type(3)));
+typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
+typedef unsigned short ushort3 __attribute__((ext_vector_type(3)));
+typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
+typedef int32_t int2 __attribute__((ext_vector_type(2)));
+typedef int32_t int3 __attribute__((ext_vector_type(3)));
+typedef int32_t int4 __attribute__((ext_vector_type(4)));
+typedef uint32_t uint2 __attribute__((ext_vector_type(2)));
+typedef uint32_t uint3 __attribute__((ext_vector_type(3)));
+typedef uint32_t uint4 __attribute__((ext_vector_type(4)));
+typedef long long long2 __attribute__((ext_vector_type(2)));
+typedef long long long3 __attribute__((ext_vector_type(3)));
+typedef long long long4 __attribute__((ext_vector_type(4)));
+typedef unsigned long long ulong2 __attribute__((ext_vector_type(2)));
+typedef unsigned long long ulong3 __attribute__((ext_vector_type(3)));
+typedef unsigned long long ulong4 __attribute__((ext_vector_type(4)));
 
 
 //////////////////////////////////////////////////////////////////////////////
@@ -353,7 +381,7 @@
 
 static const Allocation * SC_GetAllocation(const void *ptr) {
     GET_TLS();
-    return rsrGetAllocation(rsc, sc, ptr);
+    return rsdScriptGetAllocationForPointer(rsc, sc, ptr);
 }
 
 static void SC_ForEach_SAA(Script *target,
@@ -482,6 +510,15 @@
 static void SC_debugFv4(const char *s, float f1, float f2, float f3, float f4) {
     ALOGD("%s {%f, %f, %f, %f}", s, f1, f2, f3, f4);
 }
+static void SC_debugF2(const char *s, float2 f) {
+    ALOGD("%s {%f, %f}", s, f.x, f.y);
+}
+static void SC_debugF3(const char *s, float3 f) {
+    ALOGD("%s {%f, %f, %f}", s, f.x, f.y, f.z);
+}
+static void SC_debugF4(const char *s, float4 f) {
+    ALOGD("%s {%f, %f, %f, %f}", s, f.x, f.y, f.z, f.w);
+}
 static void SC_debugD(const char *s, double d) {
     ALOGD("%s %f, 0x%08llx", s, d, *((long long *) (&d)));
 }
@@ -500,20 +537,102 @@
     ALOGD("%s {%f, %f", s, f[0], f[2]);
     ALOGD("%s  %f, %f}",s, f[1], f[3]);
 }
-
+static void SC_debugI8(const char *s, char c) {
+    ALOGD("%s %hhd  0x%hhx", s, c, (unsigned char)c);
+}
+static void SC_debugC2(const char *s, char2 c) {
+    ALOGD("%s {%hhd, %hhd}  0x%hhx 0x%hhx", s, c.x, c.y, (unsigned char)c.x, (unsigned char)c.y);
+}
+static void SC_debugC3(const char *s, char3 c) {
+    ALOGD("%s {%hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z);
+}
+static void SC_debugC4(const char *s, char4 c) {
+    ALOGD("%s {%hhd, %hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z, (unsigned char)c.w);
+}
+static void SC_debugU8(const char *s, unsigned char c) {
+    ALOGD("%s %hhu  0x%hhx", s, c, c);
+}
+static void SC_debugUC2(const char *s, uchar2 c) {
+    ALOGD("%s {%hhu, %hhu}  0x%hhx 0x%hhx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugUC3(const char *s, uchar3 c) {
+    ALOGD("%s {%hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugUC4(const char *s, uchar4 c) {
+    ALOGD("%s {%hhu, %hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+static void SC_debugI16(const char *s, short c) {
+    ALOGD("%s %hd  0x%hx", s, c, c);
+}
+static void SC_debugS2(const char *s, short2 c) {
+    ALOGD("%s {%hd, %hd}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugS3(const char *s, short3 c) {
+    ALOGD("%s {%hd, %hd, %hd}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugS4(const char *s, short4 c) {
+    ALOGD("%s {%hd, %hd, %hd, %hd}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+static void SC_debugU16(const char *s, unsigned short c) {
+    ALOGD("%s %hu  0x%hx", s, c, c);
+}
+static void SC_debugUS2(const char *s, ushort2 c) {
+    ALOGD("%s {%hu, %hu}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugUS3(const char *s, ushort3 c) {
+    ALOGD("%s {%hu, %hu, %hu}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugUS4(const char *s, ushort4 c) {
+    ALOGD("%s {%hu, %hu, %hu, %hu}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
 static void SC_debugI32(const char *s, int32_t i) {
-    ALOGD("%s %i  0x%x", s, i, i);
+    ALOGD("%s %d  0x%x", s, i, i);
+}
+static void SC_debugI2(const char *s, int2 i) {
+    ALOGD("%s {%d, %d}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
+}
+static void SC_debugI3(const char *s, int3 i) {
+    ALOGD("%s {%d, %d, %d}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
+}
+static void SC_debugI4(const char *s, int4 i) {
+    ALOGD("%s {%d, %d, %d, %d}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
 }
 static void SC_debugU32(const char *s, uint32_t i) {
     ALOGD("%s %u  0x%x", s, i, i);
 }
+static void SC_debugUI2(const char *s, uint2 i) {
+    ALOGD("%s {%u, %u}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
+}
+static void SC_debugUI3(const char *s, uint3 i) {
+    ALOGD("%s {%u, %u, %u}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
+}
+static void SC_debugUI4(const char *s, uint4 i) {
+    ALOGD("%s {%u, %u, %u, %u}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
+}
 static void SC_debugLL64(const char *s, long long ll) {
     ALOGD("%s %lld  0x%llx", s, ll, ll);
 }
+static void SC_debugL2(const char *s, long2 ll) {
+    ALOGD("%s {%lld, %lld}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
+}
+static void SC_debugL3(const char *s, long3 ll) {
+    ALOGD("%s {%lld, %lld, %lld}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
+}
+static void SC_debugL4(const char *s, long4 ll) {
+    ALOGD("%s {%lld, %lld, %lld, %lld}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
+}
 static void SC_debugULL64(const char *s, unsigned long long ll) {
     ALOGD("%s %llu  0x%llx", s, ll, ll);
 }
-
+static void SC_debugUL2(const char *s, ulong2 ll) {
+    ALOGD("%s {%llu, %llu}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
+}
+static void SC_debugUL3(const char *s, ulong3 ll) {
+    ALOGD("%s {%llu, %llu, %llu}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
+}
+static void SC_debugUL4(const char *s, ulong4 ll) {
+    ALOGD("%s {%llu, %llu, %llu, %llu}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
+}
 static void SC_debugP(const char *s, const void *p) {
     ALOGD("%s %p", s, p);
 }
@@ -683,19 +802,56 @@
     { "_Z7rsDebugPKcff", (void *)&SC_debugFv2, true },
     { "_Z7rsDebugPKcfff", (void *)&SC_debugFv3, true },
     { "_Z7rsDebugPKcffff", (void *)&SC_debugFv4, true },
+    { "_Z7rsDebugPKcDv2_f", (void *)&SC_debugF2, true },
+    { "_Z7rsDebugPKcDv3_f", (void *)&SC_debugF3, true },
+    { "_Z7rsDebugPKcDv4_f", (void *)&SC_debugF4, true },
     { "_Z7rsDebugPKcd", (void *)&SC_debugD, true },
     { "_Z7rsDebugPKcPK12rs_matrix4x4", (void *)&SC_debugFM4v4, true },
     { "_Z7rsDebugPKcPK12rs_matrix3x3", (void *)&SC_debugFM3v3, true },
     { "_Z7rsDebugPKcPK12rs_matrix2x2", (void *)&SC_debugFM2v2, true },
+    { "_Z7rsDebugPKcc", (void *)&SC_debugI8, true },
+    { "_Z7rsDebugPKcDv2_c", (void *)&SC_debugC2, true },
+    { "_Z7rsDebugPKcDv3_c", (void *)&SC_debugC3, true },
+    { "_Z7rsDebugPKcDv4_c", (void *)&SC_debugC4, true },
+    { "_Z7rsDebugPKch", (void *)&SC_debugU8, true },
+    { "_Z7rsDebugPKcDv2_h", (void *)&SC_debugUC2, true },
+    { "_Z7rsDebugPKcDv3_h", (void *)&SC_debugUC3, true },
+    { "_Z7rsDebugPKcDv4_h", (void *)&SC_debugUC4, true },
+    { "_Z7rsDebugPKcs", (void *)&SC_debugI16, true },
+    { "_Z7rsDebugPKcDv2_s", (void *)&SC_debugS2, true },
+    { "_Z7rsDebugPKcDv3_s", (void *)&SC_debugS3, true },
+    { "_Z7rsDebugPKcDv4_s", (void *)&SC_debugS4, true },
+    { "_Z7rsDebugPKct", (void *)&SC_debugU16, true },
+    { "_Z7rsDebugPKcDv2_t", (void *)&SC_debugUS2, true },
+    { "_Z7rsDebugPKcDv3_t", (void *)&SC_debugUS3, true },
+    { "_Z7rsDebugPKcDv4_t", (void *)&SC_debugUS4, true },
     { "_Z7rsDebugPKci", (void *)&SC_debugI32, true },
+    { "_Z7rsDebugPKcDv2_i", (void *)&SC_debugI2, true },
+    { "_Z7rsDebugPKcDv3_i", (void *)&SC_debugI3, true },
+    { "_Z7rsDebugPKcDv4_i", (void *)&SC_debugI4, true },
     { "_Z7rsDebugPKcj", (void *)&SC_debugU32, true },
+    { "_Z7rsDebugPKcDv2_j", (void *)&SC_debugUI2, true },
+    { "_Z7rsDebugPKcDv3_j", (void *)&SC_debugUI3, true },
+    { "_Z7rsDebugPKcDv4_j", (void *)&SC_debugUI4, true },
     // Both "long" and "unsigned long" need to be redirected to their
     // 64-bit counterparts, since we have hacked Slang to use 64-bit
     // for "long" on Arm (to be similar to Java).
     { "_Z7rsDebugPKcl", (void *)&SC_debugLL64, true },
+    { "_Z7rsDebugPKcDv2_l", (void *)&SC_debugL2, true },
+    { "_Z7rsDebugPKcDv3_l", (void *)&SC_debugL3, true },
+    { "_Z7rsDebugPKcDv4_l", (void *)&SC_debugL4, true },
     { "_Z7rsDebugPKcm", (void *)&SC_debugULL64, true },
+    { "_Z7rsDebugPKcDv2_m", (void *)&SC_debugUL2, true },
+    { "_Z7rsDebugPKcDv3_m", (void *)&SC_debugUL3, true },
+    { "_Z7rsDebugPKcDv4_m", (void *)&SC_debugUL4, true },
     { "_Z7rsDebugPKcx", (void *)&SC_debugLL64, true },
+    { "_Z7rsDebugPKcDv2_x", (void *)&SC_debugL2, true },
+    { "_Z7rsDebugPKcDv3_x", (void *)&SC_debugL3, true },
+    { "_Z7rsDebugPKcDv4_x", (void *)&SC_debugL4, true },
     { "_Z7rsDebugPKcy", (void *)&SC_debugULL64, true },
+    { "_Z7rsDebugPKcDv2_y", (void *)&SC_debugUL2, true },
+    { "_Z7rsDebugPKcDv3_y", (void *)&SC_debugUL3, true },
+    { "_Z7rsDebugPKcDv4_y", (void *)&SC_debugUL4, true },
     { "_Z7rsDebugPKcPKv", (void *)&SC_debugP, true },
 
     { NULL, NULL, false }
@@ -704,13 +860,6 @@
 
 void* rsdLookupRuntimeStub(void* pContext, char const* name) {
     ScriptC *s = (ScriptC *)pContext;
-    if (!strcmp(name, "__isThreadable")) {
-      return (void*) s->mHal.info.isThreadable;
-    } else if (!strcmp(name, "__clearThreadable")) {
-      s->mHal.info.isThreadable = false;
-      return NULL;
-    }
-
     RsdSymbolTable *syms = gSyms;
     const RsdSymbolTable *sym = rsdLookupSymbolMath(name);
 
diff --git a/driver/rsdShader.cpp b/driver/rsdShader.cpp
index d39bdb8..3654090 100644
--- a/driver/rsdShader.cpp
+++ b/driver/rsdShader.cpp
@@ -516,6 +516,7 @@
     uint32_t uidx = 0;
     for (uint32_t ct=0; ct < mRSProgram->mHal.state.constantsCount; ct++) {
         Allocation *alloc = mRSProgram->mHal.state.constants[ct];
+
         if (!alloc) {
             ALOGE("Attempting to set constants on shader id %u, but alloc at slot %u is not set",
                  (uint32_t)this, ct);
@@ -523,7 +524,8 @@
             continue;
         }
 
-        const uint8_t *data = static_cast<const uint8_t *>(alloc->getPtr());
+        DrvAllocation *adrv = (DrvAllocation *)alloc->mHal.drv;
+        const uint8_t *data = static_cast<const uint8_t *>(adrv->lod[0].mallocPtr);
         const Element *e = mRSProgram->mHal.state.constantTypes[ct]->getElement();
         for (uint32_t field=0; field < e->mHal.state.fieldsCount; field++) {
             const Element *f = e->mHal.state.fields[field];
diff --git a/rs.spec b/rs.spec
index f32443f..607f7dc 100644
--- a/rs.spec
+++ b/rs.spec
@@ -346,6 +346,11 @@
     ret RsScript
     }
 
+ScriptIntrinsicCreate {
+    param uint32_t id
+    param RsElement eid
+    ret RsScript
+    }
 
 ProgramStoreCreate {
     direct
diff --git a/rsAdapter.cpp b/rsAdapter.cpp
index 41811ae..13a728f 100644
--- a/rsAdapter.cpp
+++ b/rsAdapter.cpp
@@ -34,33 +34,14 @@
     mY = 0;
     mZ = 0;
     mLOD = 0;
-    mFace = 0;
+    mFace = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X;
 }
 
-void * Adapter1D::getElement(uint32_t x) {
-    rsAssert(mAllocation.get());
-    rsAssert(mAllocation->getPtr());
-    rsAssert(mAllocation->getType());
-    uint8_t * ptr = static_cast<uint8_t *>(mAllocation->getPtr());
-    ptr += mAllocation->getType()->getLODOffset(mLOD, x, mY);
-    return ptr;
+void Adapter1D::data(Context *rsc, uint32_t x, uint32_t count, const void *data, size_t sizeBytes) {
+    mAllocation->data(rsc, x, mY, mLOD, mFace, count, 1, data, sizeBytes);
 }
 
-void Adapter1D::subData(uint32_t xoff, uint32_t count, const void *data) {
-    if (mAllocation.get() && mAllocation.get()->getType()) {
-        void *ptr = getElement(xoff);
-        count *= mAllocation.get()->getType()->getElementSizeBytes();
-        memcpy(ptr, data, count);
-    }
-}
-
-void Adapter1D::data(const void *data) {
-    memcpy(getElement(0),
-           data,
-           mAllocation.get()->getType()->getSizeBytes());
-}
-
-void Adapter1D::serialize(OStream *stream) const {
+void Adapter1D::serialize(Context *rsc, OStream *stream) const {
 }
 
 Adapter1D *Adapter1D::createFromStream(Context *rsc, IStream *stream) {
@@ -98,7 +79,7 @@
         a->setLOD(value);
         break;
     case RS_DIMENSION_FACE:
-        a->setFace(value);
+        a->setFace((RsAllocationCubemapFace)value);
         break;
     default:
         rsAssert(!"Unimplemented constraint");
@@ -106,16 +87,6 @@
     }
 }
 
-void rsi_Adapter1DSubData(Context *rsc, RsAdapter1D va, uint32_t xoff, uint32_t count, const void *data) {
-    Adapter1D * a = static_cast<Adapter1D *>(va);
-    a->subData(xoff, count, data);
-}
-
-void rsi_Adapter1DData(Context *rsc, RsAdapter1D va, const void *data) {
-    Adapter1D * a = static_cast<Adapter1D *>(va);
-    a->data(data);
-}
-
 }
 }
 
@@ -133,51 +104,17 @@
 void Adapter2D::reset() {
     mZ = 0;
     mLOD = 0;
-    mFace = 0;
+    mFace = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X;
 }
 
-void * Adapter2D::getElement(uint32_t x, uint32_t y) const {
-    rsAssert(mAllocation.get());
-    rsAssert(mAllocation->getPtr());
-    rsAssert(mAllocation->getType());
-    if (mFace != 0 && !mAllocation->getType()->getDimFaces()) {
-        ALOGE("Adapter wants cubemap face, but allocation has none");
-        return NULL;
-    }
 
-    uint8_t * ptr = static_cast<uint8_t *>(mAllocation->getPtr());
-    ptr += mAllocation->getType()->getLODOffset(mLOD, x, y);
-
-    if (mFace != 0) {
-        uint32_t totalSizeBytes = mAllocation->getType()->getSizeBytes();
-        uint32_t faceOffset = totalSizeBytes / 6;
-        ptr += faceOffset * mFace;
-    }
-    return ptr;
+void Adapter2D::data(Context *rsc, uint32_t x, uint32_t y, uint32_t w, uint32_t h,
+                     const void *data, size_t sizeBytes) {
+    mAllocation->data(rsc, x, y, mLOD, mFace, w, h, data, sizeBytes);
 }
 
-void Adapter2D::subData(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h, const void *data) {
-    rsAssert(mAllocation.get());
-    rsAssert(mAllocation->getPtr());
-    rsAssert(mAllocation->getType());
 
-    uint32_t eSize = mAllocation.get()->getType()->getElementSizeBytes();
-    uint32_t lineSize = eSize * w;
-
-    const uint8_t *src = static_cast<const uint8_t *>(data);
-    for (uint32_t line=yoff; line < (yoff+h); line++) {
-        memcpy(getElement(xoff, line), src, lineSize);
-        src += lineSize;
-    }
-}
-
-void Adapter2D::data(const void *data) {
-    memcpy(getElement(0,0),
-           data,
-           mAllocation.get()->getType()->getSizeBytes());
-}
-
-void Adapter2D::serialize(OStream *stream) const {
+void Adapter2D::serialize(Context *rsc, OStream *stream) const {
 }
 
 Adapter2D *Adapter2D::createFromStream(Context *rsc, IStream *stream) {
@@ -216,7 +153,7 @@
         a->setLOD(value);
         break;
     case RS_DIMENSION_FACE:
-        a->setFace(value);
+        a->setFace((RsAllocationCubemapFace)value);
         break;
     default:
         rsAssert(!"Unimplemented constraint");
@@ -224,15 +161,6 @@
     }
 }
 
-void rsi_Adapter2DData(Context *rsc, RsAdapter2D va, const void *data) {
-    Adapter2D * a = static_cast<Adapter2D *>(va);
-    a->data(data);
-}
-
-void rsi_Adapter2DSubData(Context *rsc, RsAdapter2D va, uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h, const void *data) {
-    Adapter2D * a = static_cast<Adapter2D *>(va);
-    a->subData(xoff, yoff, w, h, data);
-}
 
 }
 }
diff --git a/rsAdapter.h b/rsAdapter.h
index d150789..7b189cf 100644
--- a/rsAdapter.h
+++ b/rsAdapter.h
@@ -32,7 +32,6 @@
     Adapter1D(Context *);
     Adapter1D(Context *, Allocation *);
     void reset();
-    void * getElement(uint32_t x);
 
     void setAllocation(Allocation *a) {mAllocation.set(a);}
 
@@ -43,13 +42,11 @@
     inline void setY(uint32_t y) {mY = y;}
     inline void setZ(uint32_t z) {mZ = z;}
     inline void setLOD(uint32_t lod) {mLOD = lod;}
-    inline void setFace(uint32_t face) {mFace = face;}
-    //void setArray(uint32_t num, uint32_t value);
+    inline void setFace(RsAllocationCubemapFace face) {mFace = face;}
 
-    void subData(uint32_t xoff, uint32_t count, const void *data);
-    void data(const void *data);
+    void data(Context *rsc, uint32_t xoff, uint32_t count, const void *data, size_t sizeBytes);
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_ADAPTER_1D; }
     static Adapter1D *createFromStream(Context *rsc, IStream *stream);
 
@@ -58,7 +55,7 @@
     uint32_t mY;
     uint32_t mZ;
     uint32_t mLOD;
-    uint32_t mFace;
+    RsAllocationCubemapFace mFace;
 };
 
 class Adapter2D : public ObjectBase {
@@ -69,7 +66,6 @@
     Adapter2D(Context *);
     Adapter2D(Context *, Allocation *);
     void reset();
-    void * getElement(uint32_t x, uint32_t y) const;
 
     uint32_t getDimX() const {return mAllocation->getType()->getLODDimX(mLOD);}
     uint32_t getDimY() const {return mAllocation->getType()->getLODDimY(mLOD);}
@@ -78,13 +74,12 @@
     void setAllocation(Allocation *a) {mAllocation.set(a);}
     inline void setZ(uint32_t z) {mZ = z;}
     inline void setLOD(uint32_t lod) {mLOD = lod;}
-    inline void setFace(uint32_t face) {mFace = face;}
-    //void setArray(uint32_t num, uint32_t value);
+    inline void setFace(RsAllocationCubemapFace face) {mFace = face;}
 
-    void data(const void *data);
-    void subData(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h, const void *data);
+    void data(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h,
+              const void *data, size_t sizeBytes);
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_ADAPTER_2D; }
     static Adapter2D *createFromStream(Context *rsc, IStream *stream);
 
@@ -92,7 +87,7 @@
     ObjectBaseRef<Allocation> mAllocation;
     uint32_t mZ;
     uint32_t mLOD;
-    uint32_t mFace;
+    RsAllocationCubemapFace mFace;
 };
 
 }
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 5d09936..ca747e7 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -60,6 +60,7 @@
     mHal.state.hasMipmaps = type->getDimLOD();
     mHal.state.elementSizeBytes = type->getElementSizeBytes();
     mHal.state.hasReferences = mHal.state.type->getElement()->getHasReferences();
+    mHal.state.eType = mHal.state.type->getElement()->getType();
 }
 
 Allocation::~Allocation() {
@@ -72,10 +73,6 @@
     rsc->mHal.funcs.allocation.syncAll(rsc, this, src);
 }
 
-void Allocation::read(void *data) {
-    memcpy(data, getPtr(), mHal.state.type->getSizeBytes());
-}
-
 void Allocation::data(Context *rsc, uint32_t xoff, uint32_t lod,
                          uint32_t count, const void *data, size_t sizeBytes) {
     const size_t eSize = mHal.state.type->getElementSizeBytes();
@@ -113,6 +110,39 @@
                       uint32_t w, uint32_t h, uint32_t d, const void *data, size_t sizeBytes) {
 }
 
+void Allocation::read(Context *rsc, uint32_t xoff, uint32_t lod,
+                         uint32_t count, void *data, size_t sizeBytes) {
+    const size_t eSize = mHal.state.type->getElementSizeBytes();
+
+    if ((count * eSize) != sizeBytes) {
+        ALOGE("Allocation::read called with mismatched size expected %zu, got %zu",
+             (count * eSize), sizeBytes);
+        mHal.state.type->dumpLOGV("type info");
+        return;
+    }
+
+    rsc->mHal.funcs.allocation.read1D(rsc, this, xoff, lod, count, data, sizeBytes);
+}
+
+void Allocation::read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+             uint32_t w, uint32_t h, void *data, size_t sizeBytes) {
+    const size_t eSize = mHal.state.elementSizeBytes;
+    const size_t lineSize = eSize * w;
+
+    if ((lineSize * h) != sizeBytes) {
+        ALOGE("Allocation size mismatch, expected %zu, got %zu", (lineSize * h), sizeBytes);
+        rsAssert(!"Allocation::read called with mismatched size");
+        return;
+    }
+
+    rsc->mHal.funcs.allocation.read2D(rsc, this, xoff, yoff, lod, face, w, h, data, sizeBytes);
+}
+
+void Allocation::read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t zoff,
+                      uint32_t lod, RsAllocationCubemapFace face,
+                      uint32_t w, uint32_t h, uint32_t d, void *data, size_t sizeBytes) {
+}
+
 void Allocation::elementData(Context *rsc, uint32_t x, const void *data,
                                 uint32_t cIdx, size_t sizeBytes) {
     size_t eSize = mHal.state.elementSizeBytes;
@@ -199,7 +229,7 @@
     }
 
     ALOGV("%s allocation ptr=%p  mUsageFlags=0x04%x, mMipmapControl=0x%04x",
-         prefix, getPtr(), mHal.state.usageFlags, mHal.state.mipmapControl);
+         prefix, mHal.drvState.mallocPtrLOD0, mHal.state.usageFlags, mHal.state.mipmapControl);
 }
 
 uint32_t Allocation::getPackedSize() const {
@@ -207,7 +237,7 @@
     return numItems * mHal.state.type->getElement()->getSizeBytesUnpadded();
 }
 
-void Allocation::writePackedData(const Type *type,
+void Allocation::writePackedData(Context *rsc, const Type *type,
                                  uint8_t *dst, const uint8_t *src, bool dstPadded) {
     const Element *elem = type->getElement();
     uint32_t unpaddedBytes = elem->getSizeBytesUnpadded();
@@ -256,28 +286,30 @@
     delete[] sizeUnpadded;
 }
 
-void Allocation::unpackVec3Allocation(const void *data, size_t dataSize) {
+void Allocation::unpackVec3Allocation(Context *rsc, const void *data, size_t dataSize) {
     const uint8_t *src = (const uint8_t*)data;
-    uint8_t *dst = (uint8_t*)getPtr();
+    uint8_t *dst = (uint8_t *)rsc->mHal.funcs.allocation.lock1D(rsc, this);
 
-    writePackedData(getType(), dst, src, true);
+    writePackedData(rsc, getType(), dst, src, true);
+    rsc->mHal.funcs.allocation.unlock1D(rsc, this);
 }
 
-void Allocation::packVec3Allocation(OStream *stream) const {
+void Allocation::packVec3Allocation(Context *rsc, OStream *stream) const {
     uint32_t paddedBytes = getType()->getElement()->getSizeBytes();
     uint32_t unpaddedBytes = getType()->getElement()->getSizeBytesUnpadded();
     uint32_t numItems = mHal.state.type->getSizeBytes() / paddedBytes;
 
-    const uint8_t *src = (const uint8_t*)getPtr();
+    const uint8_t *src = (const uint8_t*)rsc->mHal.funcs.allocation.lock1D(rsc, this);
     uint8_t *dst = new uint8_t[numItems * unpaddedBytes];
 
-    writePackedData(getType(), dst, src, false);
+    writePackedData(rsc, getType(), dst, src, false);
     stream->addByteArray(dst, getPackedSize());
 
     delete[] dst;
+    rsc->mHal.funcs.allocation.unlock1D(rsc, this);
 }
 
-void Allocation::serialize(OStream *stream) const {
+void Allocation::serialize(Context *rsc, OStream *stream) const {
     // Need to identify ourselves
     stream->addU32((uint32_t)getClassId());
 
@@ -286,7 +318,7 @@
 
     // First thing we need to serialize is the type object since it will be needed
     // to initialize the class
-    mHal.state.type->serialize(stream);
+    mHal.state.type->serialize(rsc, stream);
 
     uint32_t dataSize = mHal.state.type->getSizeBytes();
     // 3 element vectors are padded to 4 in memory, but padding isn't serialized
@@ -295,10 +327,11 @@
     stream->addU32(packedSize);
     if (dataSize == packedSize) {
         // Now write the data
-        stream->addByteArray(getPtr(), dataSize);
+        stream->addByteArray(rsc->mHal.funcs.allocation.lock1D(rsc, this), dataSize);
+        rsc->mHal.funcs.allocation.unlock1D(rsc, this);
     } else {
         // Now write the data
-        packVec3Allocation(stream);
+        packVec3Allocation(rsc, stream);
     }
 }
 
@@ -341,7 +374,7 @@
         // Read in all of our allocation data
         alloc->data(rsc, 0, 0, count, stream->getPtr() + stream->getPos(), dataSize);
     } else {
-        alloc->unpackVec3Allocation(stream->getPtr() + stream->getPos(), dataSize);
+        alloc->unpackVec3Allocation(rsc, stream->getPtr() + stream->getPos(), dataSize);
     }
     stream->reset(stream->getPos() + dataSize);
 
@@ -367,7 +400,9 @@
 }
 
 void Allocation::freeChildrenUnlocked () {
-    decRefs(getPtr(), mHal.state.type->getSizeBytes() / mHal.state.type->getElementSizeBytes(), 0);
+    void *ptr = mRSC->mHal.funcs.allocation.lock1D(mRSC, this);
+    decRefs(ptr, mHal.state.type->getSizeBytes() / mHal.state.type->getElementSizeBytes(), 0);
+    mRSC->mHal.funcs.allocation.unlock1D(mRSC, this);
 }
 
 bool Allocation::freeChildren() {
@@ -390,7 +425,8 @@
 
     ObjectBaseRef<Type> t = mHal.state.type->cloneAndResize1D(rsc, dimX);
     if (dimX < oldDimX) {
-        decRefs(getPtr(), oldDimX - dimX, dimX);
+        decRefs(rsc->mHal.funcs.allocation.lock1D(rsc, this), oldDimX - dimX, dimX);
+        rsc->mHal.funcs.allocation.unlock1D(rsc, this);
     }
     rsc->mHal.funcs.allocation.resize(rsc, this, t.get(), mHal.state.hasReferences);
     setType(t.get());
@@ -447,76 +483,6 @@
 namespace android {
 namespace renderscript {
 
-static void AllocationGenerateScriptMips(RsContext con, RsAllocation va);
-
-static void mip565(const Adapter2D &out, const Adapter2D &in) {
-    uint32_t w = out.getDimX();
-    uint32_t h = out.getDimY();
-
-    for (uint32_t y=0; y < h; y++) {
-        uint16_t *oPtr = static_cast<uint16_t *>(out.getElement(0, y));
-        const uint16_t *i1 = static_cast<uint16_t *>(in.getElement(0, y*2));
-        const uint16_t *i2 = static_cast<uint16_t *>(in.getElement(0, y*2+1));
-
-        for (uint32_t x=0; x < w; x++) {
-            *oPtr = rsBoxFilter565(i1[0], i1[1], i2[0], i2[1]);
-            oPtr ++;
-            i1 += 2;
-            i2 += 2;
-        }
-    }
-}
-
-static void mip8888(const Adapter2D &out, const Adapter2D &in) {
-    uint32_t w = out.getDimX();
-    uint32_t h = out.getDimY();
-
-    for (uint32_t y=0; y < h; y++) {
-        uint32_t *oPtr = static_cast<uint32_t *>(out.getElement(0, y));
-        const uint32_t *i1 = static_cast<uint32_t *>(in.getElement(0, y*2));
-        const uint32_t *i2 = static_cast<uint32_t *>(in.getElement(0, y*2+1));
-
-        for (uint32_t x=0; x < w; x++) {
-            *oPtr = rsBoxFilter8888(i1[0], i1[1], i2[0], i2[1]);
-            oPtr ++;
-            i1 += 2;
-            i2 += 2;
-        }
-    }
-}
-
-static void mip8(const Adapter2D &out, const Adapter2D &in) {
-    uint32_t w = out.getDimX();
-    uint32_t h = out.getDimY();
-
-    for (uint32_t y=0; y < h; y++) {
-        uint8_t *oPtr = static_cast<uint8_t *>(out.getElement(0, y));
-        const uint8_t *i1 = static_cast<uint8_t *>(in.getElement(0, y*2));
-        const uint8_t *i2 = static_cast<uint8_t *>(in.getElement(0, y*2+1));
-
-        for (uint32_t x=0; x < w; x++) {
-            *oPtr = (uint8_t)(((uint32_t)i1[0] + i1[1] + i2[0] + i2[1]) * 0.25f);
-            oPtr ++;
-            i1 += 2;
-            i2 += 2;
-        }
-    }
-}
-
-static void mip(const Adapter2D &out, const Adapter2D &in) {
-    switch (out.getBaseType()->getElement()->getSizeBits()) {
-    case 32:
-        mip8888(out, in);
-        break;
-    case 16:
-        mip565(out, in);
-        break;
-    case 8:
-        mip8(out, in);
-        break;
-    }
-}
-
 void rsi_AllocationSyncAll(Context *rsc, RsAllocation va, RsAllocationUsageType src) {
     Allocation *a = static_cast<Allocation *>(va);
     a->sendDirty(rsc);
@@ -524,21 +490,15 @@
 }
 
 void rsi_AllocationGenerateMipmaps(Context *rsc, RsAllocation va) {
-    Allocation *texAlloc = static_cast<Allocation *>(va);
-    AllocationGenerateScriptMips(rsc, texAlloc);
+    Allocation *alloc = static_cast<Allocation *>(va);
+    rsc->mHal.funcs.allocation.generateMipmaps(rsc, alloc);
 }
 
-void rsi_AllocationCopyToBitmap(Context *rsc, RsAllocation va, void *data, size_t dataLen) {
-    Allocation *texAlloc = static_cast<Allocation *>(va);
-    const Type * t = texAlloc->getType();
-
-    size_t s = t->getDimX() * t->getDimY() * t->getElementSizeBytes();
-    if (s != dataLen) {
-        rsc->setError(RS_ERROR_BAD_VALUE, "Bitmap size didn't match allocation size");
-        return;
-    }
-
-    memcpy(data, texAlloc->getPtr(), s);
+void rsi_AllocationCopyToBitmap(Context *rsc, RsAllocation va, void *data, size_t sizeBytes) {
+    Allocation *a = static_cast<Allocation *>(va);
+    const Type * t = a->getType();
+    a->read(rsc, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+            t->getDimX(), t->getDimY(), data, sizeBytes);
 }
 
 void rsi_Allocation1DData(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t lod,
@@ -565,9 +525,16 @@
     a->data(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes);
 }
 
-void rsi_AllocationRead(Context *rsc, RsAllocation va, void *data, size_t data_length) {
+void rsi_AllocationRead(Context *rsc, RsAllocation va, void *data, size_t sizeBytes) {
     Allocation *a = static_cast<Allocation *>(va);
-    a->read(data);
+    const Type * t = a->getType();
+    if(t->getDimY()) {
+        a->read(rsc, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+                t->getDimX(), t->getDimY(), data, sizeBytes);
+    } else {
+        a->read(rsc, 0, 0, t->getDimX(), data, sizeBytes);
+    }
+
 }
 
 void rsi_AllocationResize1D(Context *rsc, RsAllocation va, uint32_t dimX) {
@@ -580,23 +547,6 @@
     a->resize2D(rsc, dimX, dimY);
 }
 
-static void AllocationGenerateScriptMips(RsContext con, RsAllocation va) {
-    Context *rsc = static_cast<Context *>(con);
-    Allocation *texAlloc = static_cast<Allocation *>(va);
-    uint32_t numFaces = texAlloc->getType()->getDimFaces() ? 6 : 1;
-    for (uint32_t face = 0; face < numFaces; face ++) {
-        Adapter2D adapt(rsc, texAlloc);
-        Adapter2D adapt2(rsc, texAlloc);
-        adapt.setFace(face);
-        adapt2.setFace(face);
-        for (uint32_t lod=0; lod < (texAlloc->getType()->getLODCount() -1); lod++) {
-            adapt.setLOD(lod);
-            adapt2.setLOD(lod + 1);
-            mip(adapt2, adapt);
-        }
-    }
-}
-
 RsAllocation rsi_AllocationCreateTyped(Context *rsc, RsType vtype,
                                        RsAllocationMipmapControl mips,
                                        uint32_t usages, uint32_t ptr) {
@@ -610,7 +560,7 @@
 
 RsAllocation rsi_AllocationCreateFromBitmap(Context *rsc, RsType vtype,
                                             RsAllocationMipmapControl mips,
-                                            const void *data, size_t data_length, uint32_t usages) {
+                                            const void *data, size_t sizeBytes, uint32_t usages) {
     Type *t = static_cast<Type *>(vtype);
 
     RsAllocation vTexAlloc = rsi_AllocationCreateTyped(rsc, vtype, mips, usages, 0);
@@ -620,9 +570,10 @@
         return NULL;
     }
 
-    memcpy(texAlloc->getPtr(), data, t->getDimX() * t->getDimY() * t->getElementSizeBytes());
+    texAlloc->data(rsc, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+                   t->getDimX(), t->getDimY(), data, sizeBytes);
     if (mips == RS_ALLOCATION_MIPMAP_FULL) {
-        AllocationGenerateScriptMips(rsc, texAlloc);
+        rsc->mHal.funcs.allocation.generateMipmaps(rsc, texAlloc);
     }
 
     texAlloc->sendDirty(rsc);
@@ -631,7 +582,7 @@
 
 RsAllocation rsi_AllocationCubeCreateFromBitmap(Context *rsc, RsType vtype,
                                                 RsAllocationMipmapControl mips,
-                                                const void *data, size_t data_length, uint32_t usages) {
+                                                const void *data, size_t sizeBytes, uint32_t usages) {
     Type *t = static_cast<Type *>(vtype);
 
     // Cubemap allocation's faces should be Width by Width each.
@@ -650,11 +601,9 @@
 
     uint8_t *sourcePtr = (uint8_t*)data;
     for (uint32_t face = 0; face < 6; face ++) {
-        Adapter2D faceAdapter(rsc, texAlloc);
-        faceAdapter.setFace(face);
-
         for (uint32_t dI = 0; dI < faceSize; dI ++) {
-            memcpy(faceAdapter.getElement(0, dI), sourcePtr + strideBytes * dI, copySize);
+            texAlloc->data(rsc, 0, dI, 0, (RsAllocationCubemapFace)face,
+                           t->getDimX(), 1, sourcePtr + strideBytes * dI, copySize);
         }
 
         // Move the data pointer to the next cube face
@@ -662,7 +611,7 @@
     }
 
     if (mips == RS_ALLOCATION_MIPMAP_FULL) {
-        AllocationGenerateScriptMips(rsc, texAlloc);
+        rsc->mHal.funcs.allocation.generateMipmaps(rsc, texAlloc);
     }
 
     texAlloc->sendDirty(rsc);
diff --git a/rsAllocation.h b/rsAllocation.h
index dce09ed..4fccf9d 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -41,6 +41,8 @@
     // The graphics equivalent of malloc.  The allocation contains a structure of elements.
 
 public:
+    const static int MAX_LOD = 16;
+
     struct Hal {
         void * drv;
 
@@ -63,12 +65,13 @@
             int32_t surfaceTextureID;
             ANativeWindow *wndSurface;
             SurfaceTexture *surfaceTexture;
+            RsDataType eType;
         };
         State state;
 
         struct DrvState {
-            mutable void * mallocPtr;
-            mutable uint32_t stride;
+            mutable void * mallocPtrLOD0;
+            mutable uint32_t strideLOD0;
         } drvState;
 
     };
@@ -80,7 +83,6 @@
     virtual ~Allocation();
     void updateCache();
 
-    void * getPtr() const {return mHal.drvState.mallocPtr;}
     const Type * getType() const {return mHal.state.type;}
 
     void syncAll(Context *rsc, RsAllocationUsageType src);
@@ -96,18 +98,22 @@
     void data(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t zoff, uint32_t lod, RsAllocationCubemapFace face,
                  uint32_t w, uint32_t h, uint32_t d, const void *data, size_t sizeBytes);
 
+    void read(Context *rsc, uint32_t xoff, uint32_t lod, uint32_t count, void *data, size_t sizeBytes);
+    void read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+                 uint32_t w, uint32_t h, void *data, size_t sizeBytes);
+    void read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t zoff, uint32_t lod, RsAllocationCubemapFace face,
+                 uint32_t w, uint32_t h, uint32_t d, void *data, size_t sizeBytes);
+
     void elementData(Context *rsc, uint32_t x,
                      const void *data, uint32_t elementOff, size_t sizeBytes);
     void elementData(Context *rsc, uint32_t x, uint32_t y,
                      const void *data, uint32_t elementOff, size_t sizeBytes);
 
-    void read(void *data);
-
     void addProgramToDirty(const Program *);
     void removeProgramToDirty(const Program *);
 
     virtual void dumpLOGV(const char *prefix) const;
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_ALLOCATION; }
     static Allocation *createFromStream(Context *rsc, IStream *stream);
 
@@ -152,9 +158,10 @@
     Allocation(Context *rsc, const Type *, uint32_t usages, RsAllocationMipmapControl mc, void *ptr);
 
     uint32_t getPackedSize() const;
-    static void writePackedData(const Type *type, uint8_t *dst, const uint8_t *src, bool dstPadded);
-    void unpackVec3Allocation(const void *data, size_t dataSize);
-    void packVec3Allocation(OStream *stream) const;
+    static void writePackedData(Context *rsc, const Type *type, uint8_t *dst,
+                                const uint8_t *src, bool dstPadded);
+    void unpackVec3Allocation(Context *rsc, const void *data, size_t dataSize);
+    void packVec3Allocation(Context *rsc, OStream *stream) const;
 };
 
 }
diff --git a/rsAnimation.cpp b/rsAnimation.cpp
index a4093d9..f6da138 100644
--- a/rsAnimation.cpp
+++ b/rsAnimation.cpp
@@ -21,7 +21,7 @@
 using namespace android;
 using namespace android::renderscript;
 
-void Animation::serialize(OStream *stream) const {
+void Animation::serialize(Context *rsc, OStream *stream) const {
 }
 
 Animation *Animation::createFromStream(Context *rsc, IStream *stream) {
diff --git a/rsAnimation.h b/rsAnimation.h
index 526a081..4e0cc89 100644
--- a/rsAnimation.h
+++ b/rsAnimation.h
@@ -36,7 +36,7 @@
 
     float eval(float) const;
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_ANIMATION; }
     static Animation *createFromStream(Context *rsc, IStream *stream);
 
diff --git a/rsDefines.h b/rsDefines.h
index 854df08..c3540f1 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -333,6 +333,12 @@
     RS_CULL_INVALID = 100,
 };
 
+enum RsScriptIntrinsicID {
+    RS_SCRIPT_INTRINSIC_ID_UNDEFINED = 0,
+    RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3 = 1,
+    RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5 = 2
+};
+
 typedef struct {
     RsA3DClassID classID;
     const char* objectName;
diff --git a/rsElement.cpp b/rsElement.cpp
index fb2892c..3126c28 100644
--- a/rsElement.cpp
+++ b/rsElement.cpp
@@ -92,7 +92,7 @@
     }
 }
 
-void Element::serialize(OStream *stream) const {
+void Element::serialize(Context *rsc, OStream *stream) const {
     // Need to identify ourselves
     stream->addU32((uint32_t)getClassId());
 
@@ -106,7 +106,7 @@
     for (uint32_t ct = 0; ct < mFieldCount; ct++) {
         stream->addString(&mFields[ct].name);
         stream->addU32(mFields[ct].arraySize);
-        mFields[ct].e->serialize(stream);
+        mFields[ct].e->serialize(rsc, stream);
     }
 }
 
@@ -130,7 +130,7 @@
                                           component.getType(),
                                           component.getKind(),
                                           component.getIsNormalized(),
-                                          component.getVectorSize());;
+                                          component.getVectorSize());
     }
 
     const Element **subElems = new const Element *[fieldCount];
diff --git a/rsElement.h b/rsElement.h
index b86d3bc..57698f4 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -105,7 +105,7 @@
     uint32_t getBitsUnpadded() const {return mBitsUnpadded;}
 
     void dumpLOGV(const char *prefix) const;
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_ELEMENT; }
     static Element *createFromStream(Context *rsc, IStream *stream);
 
diff --git a/rsFileA3D.cpp b/rsFileA3D.cpp
index a52bf7e..07c413f 100644
--- a/rsFileA3D.cpp
+++ b/rsFileA3D.cpp
@@ -352,7 +352,7 @@
     return true;
 }
 
-void FileA3D::appendToFile(ObjectBase *obj) {
+void FileA3D::appendToFile(Context *con, ObjectBase *obj) {
     if (!obj) {
         return;
     }
@@ -366,7 +366,7 @@
     indexEntry->mOffset = mWriteStream->getPos();
     indexEntry->mRsObj = obj;
     mWriteIndex.push(indexEntry);
-    obj->serialize(mWriteStream);
+    obj->serialize(con, mWriteStream);
     indexEntry->mLength = mWriteStream->getPos() - indexEntry->mOffset;
     mWriteStream->align(4);
 }
diff --git a/rsFileA3D.h b/rsFileA3D.h
index cc38c8a..06b90d7 100644
--- a/rsFileA3D.h
+++ b/rsFileA3D.h
@@ -66,12 +66,12 @@
     const A3DIndexEntry* getIndexEntry(size_t index) const;
     ObjectBase *initializeFromEntry(size_t index);
 
-    void appendToFile(ObjectBase *obj);
+    void appendToFile(Context *rsc, ObjectBase *obj);
     bool writeFile(const char *filename);
 
     // Currently files do not get serialized,
     // but we need to inherit from ObjectBase for ref tracking
-    virtual void serialize(OStream *stream) const {
+    virtual void serialize(Context *rsc, OStream *stream) const {
     }
     virtual RsA3DClassID getClassId() const {
         return RS_A3D_CLASS_ID_UNKNOWN;
diff --git a/rsFont.cpp b/rsFont.cpp
index 1f53c79..82fb90f 100644
--- a/rsFont.cpp
+++ b/rsFont.cpp
@@ -118,7 +118,7 @@
 
     FontState *state = &mRSC->mStateFont;
     uint32_t cacheWidth = state->getCacheTextureType()->getDimX();
-    const uint8_t* cacheBuffer = state->getTextTextureData();
+    const uint8_t* cacheBuffer = state->mCacheBuffer;
 
     uint32_t cacheX = 0, cacheY = 0;
     int32_t bX = 0, bY = 0;
@@ -453,7 +453,7 @@
 
     uint32_t cacheWidth = getCacheTextureType()->getDimX();
 
-    uint8_t *cacheBuffer = (uint8_t*)mTextTexture->getPtr();
+    uint8_t *cacheBuffer = mCacheBuffer;
     uint8_t *bitmapBuffer = bitmap->buffer;
 
     uint32_t cacheX = 0, bX = 0, cacheY = 0, bY = 0;
@@ -467,7 +467,10 @@
     // This will dirty the texture and the shader so next time
     // we draw it will upload the data
 
-    mTextTexture->sendDirty(mRSC);
+    mRSC->mHal.funcs.allocation.data2D(mRSC, mTextTexture.get(), 0, 0, 0,
+        RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X, mCacheWidth, mCacheHeight,
+        mCacheBuffer, mCacheWidth*mCacheHeight);
+
     mFontShaderF->bindTexture(mRSC, 0, mTextTexture.get());
 
     // Some debug code
@@ -539,13 +542,16 @@
                                                                 RS_KIND_PIXEL_A, true, 1);
 
     // We will allocate a texture to initially hold 32 character bitmaps
+    mCacheHeight = 256;
+    mCacheWidth = 1024;
     ObjectBaseRef<Type> texType = Type::getTypeRef(mRSC, alphaElem.get(),
-                                                   1024, 256, 0, false, false);
+                                                   mCacheWidth, mCacheHeight, 0, false, false);
+    mCacheBuffer = new uint8_t[mCacheWidth * mCacheHeight];
+
 
     Allocation *cacheAlloc = Allocation::createAllocation(mRSC, texType.get(),
-                                RS_ALLOCATION_USAGE_SCRIPT | RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
+                                RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
     mTextTexture.set(cacheAlloc);
-    mTextTexture->syncAll(mRSC, RS_ALLOCATION_USAGE_SCRIPT);
 
     // Split up our cache texture into lines of certain widths
     int32_t nextLine = 0;
@@ -574,7 +580,7 @@
     Allocation *indexAlloc = Allocation::createAllocation(mRSC, indexType.get(),
                                                           RS_ALLOCATION_USAGE_SCRIPT |
                                                           RS_ALLOCATION_USAGE_GRAPHICS_VERTEX);
-    uint16_t *indexPtr = (uint16_t*)indexAlloc->getPtr();
+    uint16_t *indexPtr = (uint16_t*)mRSC->mHal.funcs.allocation.lock1D(mRSC, indexAlloc);
 
     // Four verts, two triangles , six indices per quad
     for (uint32_t i = 0; i < mMaxNumberOfQuads; i ++) {
@@ -606,12 +612,14 @@
 
     Allocation *vertexAlloc = Allocation::createAllocation(mRSC, vertexDataType.get(),
                                                            RS_ALLOCATION_USAGE_SCRIPT);
-    mTextMeshPtr = (float*)vertexAlloc->getPtr();
+    mTextMeshPtr = (float*)mRSC->mHal.funcs.allocation.lock1D(mRSC, vertexAlloc);
 
     mMesh.set(new Mesh(mRSC, 1, 1));
     mMesh->setVertexBuffer(vertexAlloc, 0);
     mMesh->setPrimitive(indexAlloc, RS_PRIMITIVE_TRIANGLE, 0);
     mMesh->init();
+    mRSC->mHal.funcs.allocation.unlock1D(mRSC, indexAlloc);
+    mRSC->mHal.funcs.allocation.unlock1D(mRSC, vertexAlloc);
 }
 
 // We don't want to allocate anything unless we actually draw text
diff --git a/rsFont.h b/rsFont.h
index 2bd30b7..8f43a2a 100644
--- a/rsFont.h
+++ b/rsFont.h
@@ -67,7 +67,7 @@
 
     // Currently files do not get serialized,
     // but we need to inherit from ObjectBase for ref tracking
-    virtual void serialize(OStream *stream) const {
+    virtual void serialize(Context *rsc, OStream *stream) const {
     }
     virtual RsA3DClassID getClassId() const {
         return RS_A3D_CLASS_ID_UNKNOWN;
@@ -215,10 +215,11 @@
 
     // Texture to cache glyph bitmaps
     ObjectBaseRef<Allocation> mTextTexture;
+    uint8_t *mCacheBuffer;
+    uint32_t mCacheWidth;
+    uint32_t mCacheHeight;
+
     void initTextTexture();
-    const uint8_t* getTextTextureData() const {
-        return (uint8_t*)mTextTexture->getPtr();
-    }
 
 #ifndef ANDROID_RS_SERIALIZE
     bool cacheBitmap(FT_Bitmap_ *bitmap, uint32_t *retOriginX, uint32_t *retOriginY);
diff --git a/rsMatrix2x2.h b/rsMatrix2x2.h
index 4dcb84a..4fbd1c2 100644
--- a/rsMatrix2x2.h
+++ b/rsMatrix2x2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 The Android Open Source Project
+ * Copyright (C) 2011-2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,12 +25,12 @@
 namespace renderscript {
 
 struct Matrix2x2 : public rs_matrix2x2 {
-    inline float get(uint32_t row, uint32_t col) const {
-        return m[row*2 + col];
+    inline float get(uint32_t x, uint32_t y) const {
+        return m[x*2 + y];
     }
 
-    inline void set(uint32_t row, uint32_t col, float v) {
-        m[row*2 + col] = v;
+    inline void set(uint32_t x, uint32_t y, float v) {
+        m[x*2 + y] = v;
     }
 
     void loadIdentity();
@@ -51,12 +51,4 @@
 }
 }
 
-
-
-
-#endif
-
-
-
-
-
+#endif  // ANDROID_RS_MATRIX_2x2_H
diff --git a/rsMatrix3x3.h b/rsMatrix3x3.h
index f96d270..05249b1 100644
--- a/rsMatrix3x3.h
+++ b/rsMatrix3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 The Android Open Source Project
+ * Copyright (C) 2011-2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,12 +25,12 @@
 namespace renderscript {
 
 struct Matrix3x3 : public rs_matrix3x3 {
-    inline float get(uint32_t row, uint32_t col) const {
-        return m[row*3 + col];
+    inline float get(uint32_t x, uint32_t y) const {
+        return m[x*3 + y];
     }
 
-    inline void set(uint32_t row, uint32_t col, float v) {
-        m[row*3 + col] = v;
+    inline void set(uint32_t x, uint32_t y, float v) {
+        m[x*3 + y] = v;
     }
 
     void loadIdentity();
@@ -51,12 +51,4 @@
 }
 }
 
-
-
-
-#endif
-
-
-
-
-
+#endif  // ANDROID_RS_MATRIX_3x3_H
diff --git a/rsMatrix4x4.h b/rsMatrix4x4.h
index d30184f..44c33d1 100644
--- a/rsMatrix4x4.h
+++ b/rsMatrix4x4.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009 The Android Open Source Project
+ * Copyright (C) 2009-2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,12 +25,12 @@
 namespace renderscript {
 
 struct Matrix4x4 : public rs_matrix4x4 {
-    float get(uint32_t row, uint32_t col) const {
-        return m[row*4 + col];
+    float get(uint32_t x, uint32_t y) const {
+        return m[x*4 + y];
     }
 
-    void set(uint32_t row, uint32_t col, float v) {
-        m[row*4 + col] = v;
+    void set(uint32_t x, uint32_t y, float v) {
+        m[x*4 + y] = v;
     }
 
     void loadIdentity();
@@ -82,11 +82,4 @@
 }
 }
 
-
-
-
-#endif
-
-
-
-
+#endif  // ANDROID_RS_MATRIX_4x4_H
diff --git a/rsMesh.cpp b/rsMesh.cpp
index 399a52b..651a8f3 100644
--- a/rsMesh.cpp
+++ b/rsMesh.cpp
@@ -78,7 +78,7 @@
 #endif
 }
 
-void Mesh::serialize(OStream *stream) const {
+void Mesh::serialize(Context *rsc, OStream *stream) const {
     // Need to identify ourselves
     stream->addU32((uint32_t)getClassId());
 
@@ -88,7 +88,7 @@
     // Store number of vertex streams
     stream->addU32(mHal.state.vertexBuffersCount);
     for (uint32_t vCount = 0; vCount < mHal.state.vertexBuffersCount; vCount ++) {
-        mHal.state.vertexBuffers[vCount]->serialize(stream);
+        mHal.state.vertexBuffers[vCount]->serialize(rsc, stream);
     }
 
     stream->addU32(mHal.state.primitivesCount);
@@ -98,7 +98,7 @@
 
         if (mHal.state.indexBuffers[pCount]) {
             stream->addU32(1);
-            mHal.state.indexBuffers[pCount]->serialize(stream);
+            mHal.state.indexBuffers[pCount]->serialize(rsc, stream);
         } else {
             stream->addU32(0);
         }
@@ -215,11 +215,12 @@
     }
 }
 
-void Mesh::computeBBox() {
+void Mesh::computeBBox(Context *rsc) {
     float *posPtr = NULL;
     uint32_t vectorSize = 0;
     uint32_t stride = 0;
     uint32_t numVerts = 0;
+    Allocation *posAlloc = NULL;
     // First we need to find the position ptr and stride
     for (uint32_t ct=0; ct < mHal.state.vertexBuffersCount; ct++) {
         const Type *bufferType = mHal.state.vertexBuffers[ct]->getType();
@@ -230,7 +231,10 @@
                 vectorSize = bufferElem->getField(ct)->getComponent().getVectorSize();
                 stride = bufferElem->getSizeBytes() / sizeof(float);
                 uint32_t offset = bufferElem->getFieldOffsetBytes(ct);
-                posPtr = (float*)((uint8_t*)mHal.state.vertexBuffers[ct]->getPtr() + offset);
+                posAlloc = mHal.state.vertexBuffers[ct];
+                const uint8_t *bp = (const uint8_t *)rsc->mHal.funcs.allocation.lock1D(
+                        rsc, posAlloc);
+                posPtr = (float*)(bp + offset);
                 numVerts = bufferType->getDimX();
                 break;
             }
@@ -256,6 +260,10 @@
         }
         posPtr += stride;
     }
+
+    if (posAlloc) {
+        rsc->mHal.funcs.allocation.unlock1D(rsc, posAlloc);
+    }
 }
 
 namespace android {
diff --git a/rsMesh.h b/rsMesh.h
index 7ca63cf..9b61ebe 100644
--- a/rsMesh.h
+++ b/rsMesh.h
@@ -59,7 +59,7 @@
     Mesh(Context *, uint32_t vertexBuffersCount, uint32_t primitivesCount);
     ~Mesh();
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_MESH; }
     static Mesh *createFromStream(Context *rsc, IStream *stream);
     void init();
@@ -83,7 +83,7 @@
     // Bounding volumes
     float mBBoxMin[3];
     float mBBoxMax[3];
-    void computeBBox();
+    void computeBBox(Context *rsc);
 protected:
     ObjectBaseRef<Allocation> *mVertexBuffers;
     ObjectBaseRef<Allocation> *mIndexBuffers;
diff --git a/rsObjectBase.h b/rsObjectBase.h
index 586da19..f16acd9 100644
--- a/rsObjectBase.h
+++ b/rsObjectBase.h
@@ -58,7 +58,7 @@
     static void dumpAll(Context *rsc);
 
     virtual void dumpLOGV(const char *prefix) const;
-    virtual void serialize(OStream *stream) const = 0;
+    virtual void serialize(Context *rsc, OStream *stream) const = 0;
     virtual RsA3DClassID getClassId() const = 0;
 
     static bool isValid(const Context *rsc, const ObjectBase *obj);
diff --git a/rsPath.cpp b/rsPath.cpp
index 055bb86..bcf4b8e 100644
--- a/rsPath.cpp
+++ b/rsPath.cpp
@@ -59,7 +59,7 @@
 void Path::render(Context *rsc) {
 }
 
-void Path::serialize(OStream *stream) const {
+void Path::serialize(Context *rsc, OStream *stream) const {
 
 }
 
diff --git a/rsPath.h b/rsPath.h
index 1abfc9a..ea14335 100644
--- a/rsPath.h
+++ b/rsPath.h
@@ -43,7 +43,7 @@
     ~Path();
 
     void render(Context *);
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const;
 
 private:
diff --git a/rsProgramFragment.cpp b/rsProgramFragment.cpp
index bebde1e..438b620 100644
--- a/rsProgramFragment.cpp
+++ b/rsProgramFragment.cpp
@@ -53,8 +53,10 @@
     mConstantColor[1] = g;
     mConstantColor[2] = b;
     mConstantColor[3] = a;
-    memcpy(mHal.state.constants[0]->getPtr(), mConstantColor, 4*sizeof(float));
+    void *p = rsc->mHal.funcs.allocation.lock1D(rsc, mHal.state.constants[0]);
+    memcpy(p, mConstantColor, 4*sizeof(float));
     mDirty = true;
+    rsc->mHal.funcs.allocation.unlock1D(rsc, mHal.state.constants[0]);
 }
 
 void ProgramFragment::setup(Context *rsc, ProgramFragmentState *state) {
@@ -74,7 +76,7 @@
     rsc->mHal.funcs.fragment.setActive(rsc, this);
 }
 
-void ProgramFragment::serialize(OStream *stream) const {
+void ProgramFragment::serialize(Context *rsc, OStream *stream) const {
 }
 
 ProgramFragment *ProgramFragment::createFromStream(Context *rsc, IStream *stream) {
diff --git a/rsProgramFragment.h b/rsProgramFragment.h
index 4eb28e7..d580252 100644
--- a/rsProgramFragment.h
+++ b/rsProgramFragment.h
@@ -34,7 +34,7 @@
 
     virtual void setup(Context *, ProgramFragmentState *);
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_PROGRAM_FRAGMENT; }
     static ProgramFragment *createFromStream(Context *rsc, IStream *stream);
 
diff --git a/rsProgramRaster.cpp b/rsProgramRaster.cpp
index 94bfe42..4f27f2e 100644
--- a/rsProgramRaster.cpp
+++ b/rsProgramRaster.cpp
@@ -53,7 +53,7 @@
     rsc->mHal.funcs.raster.setActive(rsc, this);
 }
 
-void ProgramRaster::serialize(OStream *stream) const {
+void ProgramRaster::serialize(Context *rsc, OStream *stream) const {
 }
 
 ProgramRaster *ProgramRaster::createFromStream(Context *rsc, IStream *stream) {
diff --git a/rsProgramRaster.h b/rsProgramRaster.h
index c552ea3..e9a524b 100644
--- a/rsProgramRaster.h
+++ b/rsProgramRaster.h
@@ -46,7 +46,7 @@
     Hal mHal;
 
     virtual void setup(const Context *, ProgramRasterState *);
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_PROGRAM_RASTER; }
     static ProgramRaster *createFromStream(Context *rsc, IStream *stream);
 
diff --git a/rsProgramStore.cpp b/rsProgramStore.cpp
index 7e25a22..83c1f2c 100644
--- a/rsProgramStore.cpp
+++ b/rsProgramStore.cpp
@@ -63,7 +63,7 @@
     rsc->mHal.funcs.store.setActive(rsc, this);
 }
 
-void ProgramStore::serialize(OStream *stream) const {
+void ProgramStore::serialize(Context *rsc, OStream *stream) const {
 }
 
 ProgramStore *ProgramStore::createFromStream(Context *rsc, IStream *stream) {
diff --git a/rsProgramStore.h b/rsProgramStore.h
index 9bb2795..9a7f7f1 100644
--- a/rsProgramStore.h
+++ b/rsProgramStore.h
@@ -59,7 +59,7 @@
 
     virtual void setup(const Context *, ProgramStoreState *);
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_PROGRAM_STORE; }
     static ProgramStore *createFromStream(Context *rsc, IStream *stream);
     static ObjectBaseRef<ProgramStore> getProgramStore(Context *,
diff --git a/rsProgramVertex.cpp b/rsProgramVertex.cpp
index 23fcbe7..c2ce7ee 100644
--- a/rsProgramVertex.cpp
+++ b/rsProgramVertex.cpp
@@ -46,7 +46,8 @@
                           "Unable to set fixed function emulation matrices because allocation is missing");
             return;
         }
-        float *f = static_cast<float *>(mHal.state.constants[0]->getPtr());
+        float *f = static_cast<float *>(rsc->mHal.funcs.allocation.lock1D(
+                rsc, mHal.state.constants[0]));
         Matrix4x4 mvp;
         mvp.load(&f[RS_PROGRAM_VERTEX_PROJECTION_OFFSET]);
         Matrix4x4 t;
@@ -55,6 +56,7 @@
         for (uint32_t i = 0; i < 16; i ++) {
             f[RS_PROGRAM_VERTEX_MVP_OFFSET + i] = mvp.m[i];
         }
+        rsc->mHal.funcs.allocation.unlock1D(rsc, mHal.state.constants[0]);
     }
 
     state->mLast.set(this);
@@ -73,9 +75,11 @@
                       "Unable to set fixed function emulation matrix projection because allocation is missing");
         return;
     }
-    float *f = static_cast<float *>(mHal.state.constants[0]->getPtr());
+    float *f = static_cast<float *>(rsc->mHal.funcs.allocation.lock1D(
+                rsc, mHal.state.constants[0]));
     memcpy(&f[RS_PROGRAM_VERTEX_PROJECTION_OFFSET], m, sizeof(rsc_Matrix));
     mDirty = true;
+    rsc->mHal.funcs.allocation.unlock1D(rsc, mHal.state.constants[0]);
 }
 
 void ProgramVertex::setModelviewMatrix(Context *rsc, const rsc_Matrix *m) const {
@@ -89,9 +93,11 @@
                       "Unable to set fixed function emulation matrix modelview because allocation is missing");
         return;
     }
-    float *f = static_cast<float *>(mHal.state.constants[0]->getPtr());
+    float *f = static_cast<float *>(rsc->mHal.funcs.allocation.lock1D(
+                rsc, mHal.state.constants[0]));
     memcpy(&f[RS_PROGRAM_VERTEX_MODELVIEW_OFFSET], m, sizeof(rsc_Matrix));
     mDirty = true;
+    rsc->mHal.funcs.allocation.unlock1D(rsc, mHal.state.constants[0]);
 }
 
 void ProgramVertex::setTextureMatrix(Context *rsc, const rsc_Matrix *m) const {
@@ -105,9 +111,11 @@
                       "Unable to set fixed function emulation matrix texture because allocation is missing");
         return;
     }
-    float *f = static_cast<float *>(mHal.state.constants[0]->getPtr());
+    float *f = static_cast<float *>(rsc->mHal.funcs.allocation.lock1D(
+            rsc, mHal.state.constants[0]));
     memcpy(&f[RS_PROGRAM_VERTEX_TEXTURE_OFFSET], m, sizeof(rsc_Matrix));
     mDirty = true;
+    rsc->mHal.funcs.allocation.unlock1D(rsc, mHal.state.constants[0]);
 }
 
 void ProgramVertex::getProjectionMatrix(Context *rsc, rsc_Matrix *m) const {
@@ -121,22 +129,26 @@
                       "Unable to get fixed function emulation matrix projection because allocation is missing");
         return;
     }
-    float *f = static_cast<float *>(mHal.state.constants[0]->getPtr());
+    float *f = static_cast<float *>(
+            rsc->mHal.funcs.allocation.lock1D(rsc, mHal.state.constants[0]));
     memcpy(m, &f[RS_PROGRAM_VERTEX_PROJECTION_OFFSET], sizeof(rsc_Matrix));
+    rsc->mHal.funcs.allocation.unlock1D(rsc, mHal.state.constants[0]);
 }
 
 void ProgramVertex::transformToScreen(Context *rsc, float *v4out, const float *v3in) const {
     if (isUserProgram()) {
         return;
     }
-    float *f = static_cast<float *>(mHal.state.constants[0]->getPtr());
+    float *f = static_cast<float *>(
+            rsc->mHal.funcs.allocation.lock1D(rsc, mHal.state.constants[0]));
     Matrix4x4 mvp;
     mvp.loadMultiply((Matrix4x4 *)&f[RS_PROGRAM_VERTEX_MODELVIEW_OFFSET],
                      (Matrix4x4 *)&f[RS_PROGRAM_VERTEX_PROJECTION_OFFSET]);
     mvp.vectorMultiply(v4out, v3in);
+    rsc->mHal.funcs.allocation.unlock1D(rsc, mHal.state.constants[0]);
 }
 
-void ProgramVertex::serialize(OStream *stream) const {
+void ProgramVertex::serialize(Context *rsc, OStream *stream) const {
 }
 
 ProgramVertex *ProgramVertex::createFromStream(Context *rsc, IStream *stream) {
@@ -207,7 +219,7 @@
 }
 
 void ProgramVertexState::updateSize(Context *rsc) {
-    float *f = static_cast<float *>(mDefaultAlloc->getPtr());
+    float *f = static_cast<float *>(rsc->mHal.funcs.allocation.lock1D(rsc, mDefaultAlloc.get()));
 
     float surfaceWidth = (float)rsc->getCurrentSurfaceWidth();
     float surfaceHeight = (float)rsc->getCurrentSurfaceHeight();
@@ -220,6 +232,7 @@
     m.loadIdentity();
     memcpy(&f[RS_PROGRAM_VERTEX_MODELVIEW_OFFSET], m.m, sizeof(m));
     memcpy(&f[RS_PROGRAM_VERTEX_TEXTURE_OFFSET], m.m, sizeof(m));
+    rsc->mHal.funcs.allocation.unlock1D(rsc, mDefaultAlloc.get());
 }
 
 void ProgramVertexState::deinit(Context *rsc) {
diff --git a/rsProgramVertex.h b/rsProgramVertex.h
index 67c2a88..105d065 100644
--- a/rsProgramVertex.h
+++ b/rsProgramVertex.h
@@ -41,7 +41,7 @@
 
     void transformToScreen(Context *, float *v4out, const float *v3in) const;
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_PROGRAM_VERTEX; }
     static ProgramVertex *createFromStream(Context *rsc, IStream *stream);
 };
diff --git a/rsRuntime.h b/rsRuntime.h
index 64f2de8..eff691b 100644
--- a/rsRuntime.h
+++ b/rsRuntime.h
@@ -149,7 +149,6 @@
 
 uint32_t rsrToClient(Context *, Script *, int cmdID, void *data, int len);
 uint32_t rsrToClientBlocking(Context *, Script *, int cmdID, void *data, int len);
-const Allocation * rsrGetAllocation(Context *, Script *, const void *ptr);
 
 void rsrAllocationMarkDirty(Context *, Script *, RsAllocation a);
 void rsrAllocationSyncAll(Context *, Script *, Allocation *a, RsAllocationUsageType source);
diff --git a/rsSampler.cpp b/rsSampler.cpp
index c7180bd..fededb1 100644
--- a/rsSampler.cpp
+++ b/rsSampler.cpp
@@ -68,7 +68,7 @@
     ss->mSamplers[slot].clear();
 }
 
-void Sampler::serialize(OStream *stream) const {
+void Sampler::serialize(Context *rsc, OStream *stream) const {
 }
 
 Sampler *Sampler::createFromStream(Context *rsc, IStream *stream) {
diff --git a/rsSampler.h b/rsSampler.h
index dea4cb6..81220a8 100644
--- a/rsSampler.h
+++ b/rsSampler.h
@@ -61,7 +61,7 @@
     void bindToContext(SamplerState *, uint32_t slot);
     void unbindFromContext(SamplerState *);
 
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SAMPLER; }
     static Sampler *createFromStream(Context *rsc, IStream *stream);
 
diff --git a/rsScript.cpp b/rsScript.cpp
index d39fb5e..25ee1a0 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -48,11 +48,7 @@
     }
 
     mSlots[slot].set(a);
-    if (a != NULL) {
-        mRSC->mHal.funcs.script.setGlobalBind(mRSC, this, slot, a->getPtr());
-    } else {
-        mRSC->mHal.funcs.script.setGlobalBind(mRSC, this, slot, NULL);
-    }
+    mRSC->mHal.funcs.script.setGlobalBind(mRSC, this, slot, a);
 }
 
 void Script::setVar(uint32_t slot, const void *val, size_t len) {
@@ -97,7 +93,6 @@
     Script *s = static_cast<Script *>(vs);
     Allocation *a = static_cast<Allocation *>(va);
     s->setSlot(slot, a);
-    //ALOGE("rsi_ScriptBindAllocation %i  %p  %p", slot, a, a->getPtr());
 }
 
 void rsi_ScriptSetTimeZone(Context * rsc, RsScript vs, const char * timeZone, size_t length) {
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index 79725b9..466c18a 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -62,31 +62,10 @@
 
         if (!mTypes[ct].get())
             continue;
-        void *ptr = NULL;
-        if (mSlots[ct].get()) {
-            ptr = mSlots[ct]->getPtr();
-        }
-
-        rsc->mHal.funcs.script.setGlobalBind(rsc, this, ct, ptr);
+        rsc->mHal.funcs.script.setGlobalBind(rsc, this, ct, mSlots[ct].get());
     }
 }
 
-const Allocation *ScriptC::ptrToAllocation(const void *ptr) const {
-    //ALOGE("ptr to alloc %p", ptr);
-    if (!ptr) {
-        return NULL;
-    }
-    for (uint32_t ct=0; ct < mHal.info.exportedVariableCount; ct++) {
-        if (!mSlots[ct].get())
-            continue;
-        if (mSlots[ct]->getPtr() == ptr) {
-            return mSlots[ct].get();
-        }
-    }
-    ALOGE("ScriptC::ptrToAllocation, failed to find %p", ptr);
-    return NULL;
-}
-
 void ScriptC::setupGLState(Context *rsc) {
     if (mEnviroment.mFragmentStore.get()) {
         rsc->setProgramStore(mEnviroment.mFragmentStore.get());
diff --git a/rsScriptC.h b/rsScriptC.h
index 92e1f4f..6bc41f2 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -38,10 +38,6 @@
     ScriptC(Context *);
     virtual ~ScriptC();
 
-
-    const Allocation *ptrToAllocation(const void *) const;
-
-
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len);
 
     virtual uint32_t run(Context *);
@@ -54,7 +50,7 @@
                             size_t usrBytes,
                             const RsScriptCall *sc = NULL);
 
-    virtual void serialize(OStream *stream) const {    }
+    virtual void serialize(Context *rsc, OStream *stream) const {    }
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SCRIPT_C; }
     static Type *createFromStream(Context *rsc, IStream *stream) { return NULL; }
 
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index 749495d..ac3dd12 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -188,10 +188,6 @@
                               srcAlloc, srcXoff, srcYoff, srcMip, srcFace);
 }
 
-const Allocation * rsrGetAllocation(Context *rsc, Script *s, const void *ptr) {
-    ScriptC *sc = (ScriptC *)s;
-    return sc->ptrToAllocation(ptr);
-}
 
 }
 }
diff --git a/rsScriptC_LibGL.cpp b/rsScriptC_LibGL.cpp
index 21b1c42..6a897a3 100644
--- a/rsScriptC_LibGL.cpp
+++ b/rsScriptC_LibGL.cpp
@@ -251,7 +251,7 @@
                                float *minX, float *minY, float *minZ,
                                float *maxX, float *maxY, float *maxZ) {
     CHECK_OBJ(sm);
-    sm->computeBBox();
+    sm->computeBBox(rsc);
     *minX = sm->mBBoxMin[0];
     *minY = sm->mBBoxMin[1];
     *minZ = sm->mBBoxMin[2];
@@ -285,9 +285,10 @@
 }
 
 void rsrDrawTextAlloc(Context *rsc, Script *sc, Allocation *a, int x, int y) {
-    const char *text = (const char *)a->getPtr();
+    const char *text = (const char *)rsc->mHal.funcs.allocation.lock1D(rsc, a);
     size_t allocSize = a->getType()->getSizeBytes();
     rsc->mStateFont.renderText(text, allocSize, x, y);
+    rsc->mHal.funcs.allocation.unlock1D(rsc, a);
 }
 
 void rsrDrawText(Context *rsc, Script *sc, const char *text, int x, int y) {
@@ -314,11 +315,12 @@
 void rsrMeasureTextAlloc(Context *rsc, Script *sc, Allocation *a,
                          int32_t *left, int32_t *right, int32_t *top, int32_t *bottom) {
     CHECK_OBJ(a);
-    const char *text = (const char *)a->getPtr();
+    const char *text = (const char *)rsc->mHal.funcs.allocation.lock1D(rsc, a);
     size_t textLen = a->getType()->getSizeBytes();
     Font::Rect metrics;
     rsc->mStateFont.measureText(text, textLen, &metrics);
     SetMetrics(&metrics, left, right, top, bottom);
+    rsc->mHal.funcs.allocation.unlock1D(rsc, a);
 }
 
 void rsrMeasureText(Context *rsc, Script *sc, const char *text,
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
new file mode 100644
index 0000000..51f0a5d
--- /dev/null
+++ b/rsScriptIntrinsic.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsContext.h"
+#include "rsScriptIntrinsic.h"
+#include <time.h>
+
+using namespace android;
+using namespace android::renderscript;
+
+ScriptIntrinsic::ScriptIntrinsic(Context *rsc) : Script(rsc) {  // only forwards rsc to the Script base
+}
+
+ScriptIntrinsic::~ScriptIntrinsic() {  // empty body: nothing is released here
+}
+
+// Records the intrinsic id and element, allocates two-entry slot/type arrays,
+// then asks the driver to set up the intrinsic. Returns the driver's result.
+bool ScriptIntrinsic::init(Context *rsc, RsScriptIntrinsicID iid, Element *e) {
+    mIntrinsicID = iid;
+    mElement.set(e);
+    mSlots = new ObjectBaseRef<Allocation>[2];
+    mTypes = new ObjectBaseRef<const Type>[2];
+    // The initIntrinsic hook is declared to return bool; propagate it rather
+    // than always reporting success, so the caller's failure path can run.
+    return rsc->mHal.funcs.script.initIntrinsic(rsc, this, iid, e);
+}
+
+bool ScriptIntrinsic::freeChildren() {
+    return false;  // unconditionally false; this override does no work
+}
+
+void ScriptIntrinsic::setupScript(Context *rsc) {  // no-op: empty body
+}
+
+uint32_t ScriptIntrinsic::run(Context *rsc) {  // not an expected entry point for this class
+    rsAssert(!"ScriptIntrinsic::run - should not happen");  // !"literal" is false: always trips when rsAssert is active
+    return 0;
+}
+
+
+void ScriptIntrinsic::runForEach(Context *rsc,
+                         uint32_t slot,
+                         const Allocation * ain,
+                         Allocation * aout,
+                         const void * usr,
+                         size_t usrBytes,
+                         const RsScriptCall *sc) {
+    // Hand every argument, unchanged, to the HAL's script.invokeForEach hook.
+    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
+}
+
+void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {  // no-op: empty body
+}
+
+void ScriptIntrinsic::serialize(Context *rsc, OStream *stream) const {  // no-op: writes nothing to stream
+}
+
+RsA3DClassID ScriptIntrinsic::getClassId() const {
+    return (RsA3DClassID)0;  // NOTE(review): bare 0 instead of a named RS_A3D_CLASS_ID_* value - confirm intended
+}
+
+
+
+namespace android {
+namespace renderscript {
+
+// Builds a ScriptIntrinsic for intrinsic `id` over element `ve`; returns NULL if init() fails.
+RsScript rsi_ScriptIntrinsicCreate(Context *rsc, uint32_t id, RsElement ve) {
+    ScriptIntrinsic *si = new ScriptIntrinsic(rsc);
+    ALOGV("rsi_ScriptIntrinsicCreate %u", id);  // routine trace, not an error; id is unsigned
+    if (!si->init(rsc, (RsScriptIntrinsicID)id, (Element *)ve)) {
+        delete si;  // init failed: discard the partially set-up script
+        return NULL;
+    }
+    return si;
+}
+
+}
+}
+
+
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
new file mode 100644
index 0000000..310cbec
--- /dev/null
+++ b/rsScriptIntrinsic.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RS_SCRIPT_INTRINSIC_H
+#define ANDROID_RS_SCRIPT_INTRINSIC_H
+
+#include "rsScript.h"
+
+
+// ---------------------------------------------------------------------------
+namespace android {
+namespace renderscript {
+
+
+class ScriptIntrinsic : public Script {  // Script subclass instantiated by rsi_ScriptIntrinsicCreate
+public:
+    // Element given to init(); rsScriptIntrinsic.cpp stores it with mElement.set(e).
+    ObjectBaseRef<const Element> mElement;
+
+    ScriptIntrinsic(Context *);
+    virtual ~ScriptIntrinsic();
+
+    bool init(Context *rsc, RsScriptIntrinsicID iid, Element *e);
+    // init() records iid and e, allocates two-entry mSlots/mTypes arrays, and
+    // calls the HAL's script.initIntrinsic hook (see rsScriptIntrinsic.cpp).
+    virtual void serialize(Context *rsc, OStream *stream) const;  // empty body in the .cpp
+    virtual RsA3DClassID getClassId() const;  // returns (RsA3DClassID)0 in the .cpp
+    virtual bool freeChildren();  // returns false in the .cpp
+
+    virtual void runForEach(Context *rsc,
+                            uint32_t slot,
+                            const Allocation * ain,
+                            Allocation * aout,
+                            const void * usr,
+                            size_t usrBytes,
+                            const RsScriptCall *sc = NULL);  // forwards to the HAL's invokeForEach
+
+    virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len);  // empty body in the .cpp
+    virtual void setupScript(Context *rsc);  // empty body in the .cpp
+    virtual uint32_t run(Context *);  // asserts "should not happen" and returns 0
+protected:
+    uint32_t mIntrinsicID;  // set from the iid argument of init()
+    float mParams[9];  // NOTE(review): not referenced in rsScriptIntrinsic.cpp - confirm intended use
+
+};
+
+
+}
+}
+#endif
+
+
diff --git a/rsType.cpp b/rsType.cpp
index e11b9c1..7ed8d97 100644
--- a/rsType.cpp
+++ b/rsType.cpp
@@ -155,14 +155,14 @@
     mElement->dumpLOGV(buf);
 }
 
-void Type::serialize(OStream *stream) const {
+void Type::serialize(Context *rsc, OStream *stream) const {
     // Need to identify ourselves
     stream->addU32((uint32_t)getClassId());
 
     String8 name(getName());
     stream->addString(&name);
 
-    mElement->serialize(stream);
+    mElement->serialize(rsc, stream);
 
     stream->addU32(mHal.state.dimX);
     stream->addU32(mHal.state.dimY);
diff --git a/rsType.h b/rsType.h
index ed4aa79..1d136b4 100644
--- a/rsType.h
+++ b/rsType.h
@@ -99,7 +99,7 @@
     void compute();
 
     void dumpLOGV(const char *prefix) const;
-    virtual void serialize(OStream *stream) const;
+    virtual void serialize(Context *rsc, OStream *stream) const;
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_TYPE; }
     static Type *createFromStream(Context *rsc, IStream *stream);
 
diff --git a/rsUtils.h b/rsUtils.h
index cbbae6c..ebfc679 100644
--- a/rsUtils.h
+++ b/rsUtils.h
@@ -43,26 +43,6 @@
 #define rsAssert(v) while (0)
 #endif
 
-typedef float rsvF_2 __attribute__ ((vector_size (8)));
-typedef float rsvF_4 __attribute__ ((vector_size (16)));
-typedef uint8_t rsvU8_4 __attribute__ ((vector_size (4)));
-
-union float2 {
-    rsvF_2 v;
-    float f[2];
-};
-
-union float4 {
-    rsvF_4 v;
-    float f[4];
-};
-
-union uchar4 {
-    rsvU8_4 v;
-    uint8_t f[4];
-    uint32_t packed;
-};
-
 template<typename T>
 T rsMin(T in1, T in2)
 {
diff --git a/rs_hal.h b/rs_hal.h
index b4da744..c521ef5 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -47,13 +47,25 @@
     const void *in;
     void *out;
     const void *usr;
-    size_t usr_len;
+    size_t usrLen;
     uint32_t x;
     uint32_t y;
     uint32_t z;
     uint32_t lod;
     RsAllocationCubemapFace face;
     uint32_t ar[16];
+
+    uint32_t dimX;
+    uint32_t dimY;
+    uint32_t dimZ;
+    uint32_t dimArray;
+
+    const uint8_t *ptrIn;
+    uint8_t *ptrOut;
+    uint32_t eStrideIn;
+    uint32_t eStrideOut;
+    uint32_t yStrideIn;
+    uint32_t yStrideOut;
 } RsForEachStubParamStruct;
 
 /**
@@ -78,6 +90,9 @@
                      uint8_t const *bitcode,
                      size_t bitcodeSize,
                      uint32_t flags);
+        bool (*initIntrinsic)(const Context *rsc, Script *s,
+                              RsScriptIntrinsicID iid,
+                              Element *e);
 
         void (*invokeFunction)(const Context *rsc, Script *s,
                                uint32_t slot,
@@ -108,7 +123,7 @@
                                          size_t dimLength);
         void (*setGlobalBind)(const Context *rsc, const Script *s,
                               uint32_t slot,
-                              void *data);
+                              Allocation *data);
         void (*setGlobalObj)(const Context *rsc, const Script *s,
                              uint32_t slot,
                              ObjectBase *data);
@@ -142,6 +157,24 @@
                        uint32_t lod, RsAllocationCubemapFace face,
                        uint32_t w, uint32_t h, uint32_t d, const void *data, size_t sizeBytes);
 
+        void (*read1D)(const Context *rsc, const Allocation *alloc,
+                       uint32_t xoff, uint32_t lod, uint32_t count,
+                       void *data, size_t sizeBytes);
+        void (*read2D)(const Context *rsc, const Allocation *alloc,
+                       uint32_t xoff, uint32_t yoff, uint32_t lod,
+                       RsAllocationCubemapFace face, uint32_t w, uint32_t h,
+                       void *data, size_t sizeBytes);
+        void (*read3D)(const Context *rsc, const Allocation *alloc,
+                       uint32_t xoff, uint32_t yoff, uint32_t zoff,
+                       uint32_t lod, RsAllocationCubemapFace face,
+                       uint32_t w, uint32_t h, uint32_t d, void *data, size_t sizeBytes);
+
+        // Lock and unlock make a 1D region of memory available to the CPU
+        // for direct access by pointer.  Once unlock is called control is
+        // returned to the SOC driver.
+        void * (*lock1D)(const Context *rsc, const Allocation *alloc);
+        void (*unlock1D)(const Context *rsc, const Allocation *alloc);
+
         // Allocation to allocation copies
         void (*allocData1D)(const Context *rsc,
                             const Allocation *dstAlloc,
@@ -168,7 +201,7 @@
         void (*elementData2D)(const Context *rsc, const Allocation *alloc, uint32_t x, uint32_t y,
                               const void *data, uint32_t elementOff, size_t sizeBytes);
 
-
+        void (*generateMipmaps)(const Context *rsc, const Allocation *alloc);
     } allocation;
 
     struct {
diff --git a/scriptc/rs_cl.rsh b/scriptc/rs_cl.rsh
index 45d7818..ad7e56d 100644
--- a/scriptc/rs_cl.rsh
+++ b/scriptc/rs_cl.rsh
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 The Android Open Source Project
+ * Copyright (C) 2011-2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,11 +25,11 @@
 
 // Conversions
 #define CVT_FUNC_2(typeout, typein)                             \
-_RS_RUNTIME typeout##2 __attribute__((overloadable))             \
+_RS_RUNTIME typeout##2 __attribute__((overloadable))            \
         convert_##typeout##2(typein##2 v);                      \
-_RS_RUNTIME typeout##3 __attribute__((overloadable))             \
+_RS_RUNTIME typeout##3 __attribute__((overloadable))            \
         convert_##typeout##3(typein##3 v);                      \
-_RS_RUNTIME typeout##4 __attribute__((overloadable))             \
+_RS_RUNTIME typeout##4 __attribute__((overloadable))            \
         convert_##typeout##4(typein##4 v);
 
 
@@ -41,73 +41,140 @@
                         CVT_FUNC_2(type, int)       \
                         CVT_FUNC_2(type, float)
 
+/**
+ * Convert to char.
+ *
+ * Supports 2,3,4 components of uchar, char, ushort, short, uint, int, float.
+ */
 CVT_FUNC(char)
+
+/**
+ * Convert to unsigned char.
+ *
+ * Supports 2,3,4 components of uchar, char, ushort, short, uint, int, float.
+ */
 CVT_FUNC(uchar)
+
+/**
+ * Convert to short.
+ *
+ * Supports 2,3,4 components of uchar, char, ushort, short, uint, int, float.
+ */
 CVT_FUNC(short)
+
+/**
+ * Convert to unsigned short.
+ *
+ * Supports 2,3,4 components of uchar, char, ushort, short, uint, int, float.
+ */
 CVT_FUNC(ushort)
+
+/**
+ * Convert to int.
+ *
+ * Supports 2,3,4 components of uchar, char, ushort, short, uint, int, float.
+ */
 CVT_FUNC(int)
+
+/**
+ * Convert to unsigned int.
+ *
+ * Supports 2,3,4 components of uchar, char, ushort, short, uint, int, float.
+ */
 CVT_FUNC(uint)
+
+/**
+ * Convert to float.
+ *
+ * Supports 2,3,4 components of uchar, char, ushort, short, uint, int, float.
+ */
 CVT_FUNC(float)
 
 // Float ops, 6.11.2
 
 #define FN_FUNC_FN(fnc)                                         \
-_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v);  \
-_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v);  \
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v); \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v); \
 _RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v);
 
+#define F_FUNC_FN(fnc)                                          \
+_RS_RUNTIME float __attribute__((overloadable)) fnc(float2 v);  \
+_RS_RUNTIME float __attribute__((overloadable)) fnc(float3 v);  \
+_RS_RUNTIME float __attribute__((overloadable)) fnc(float4 v);
+
 #define IN_FUNC_FN(fnc)                                         \
-_RS_RUNTIME int2 __attribute__((overloadable)) fnc(float2 v);    \
-_RS_RUNTIME int3 __attribute__((overloadable)) fnc(float3 v);    \
+_RS_RUNTIME int2 __attribute__((overloadable)) fnc(float2 v);   \
+_RS_RUNTIME int3 __attribute__((overloadable)) fnc(float3 v);   \
 _RS_RUNTIME int4 __attribute__((overloadable)) fnc(float4 v);
 
 #define FN_FUNC_FN_FN(fnc)                                                  \
-_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2);  \
-_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2);  \
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2); \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2); \
 _RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2);
 
+#define F_FUNC_FN_FN(fnc)                                                   \
+_RS_RUNTIME float __attribute__((overloadable)) fnc(float2 v1, float2 v2);  \
+_RS_RUNTIME float __attribute__((overloadable)) fnc(float3 v1, float3 v2);  \
+_RS_RUNTIME float __attribute__((overloadable)) fnc(float4 v1, float4 v2);
+
 #define FN_FUNC_FN_F(fnc)                                                   \
-_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, float v2);   \
-_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, float v2);   \
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, float v2);  \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, float v2);  \
 _RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, float v2);
 
 #define FN_FUNC_FN_IN(fnc)                                                  \
-_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2);    \
-_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2);    \
-_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2);    \
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2);   \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2);   \
+_RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2);   \
 
 #define FN_FUNC_FN_I(fnc)                                                   \
-_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int v2);     \
-_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int v2);     \
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int v2);    \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int v2);    \
 _RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, int v2);
 
-#define FN_FUNC_FN_PFN(fnc)                     \
-_RS_RUNTIME float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 *v2);             \
-_RS_RUNTIME float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 *v2);             \
-_RS_RUNTIME float4 __attribute__((overloadable)) \
+#define FN_FUNC_FN_PFN(fnc)                         \
+_RS_RUNTIME float2 __attribute__((overloadable))    \
+        fnc(float2 v1, float2 *v2);                 \
+_RS_RUNTIME float3 __attribute__((overloadable))    \
+        fnc(float3 v1, float3 *v2);                 \
+_RS_RUNTIME float4 __attribute__((overloadable))    \
         fnc(float4 v1, float4 *v2);
 
 #define FN_FUNC_FN_PIN(fnc)                                                 \
-_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2);   \
-_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2);   \
+_RS_RUNTIME float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2);  \
+_RS_RUNTIME float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2);  \
 _RS_RUNTIME float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2);
 
-#define FN_FUNC_FN_FN_FN(fnc)                   \
-_RS_RUNTIME float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 v2, float2 v3);   \
-_RS_RUNTIME float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 v2, float3 v3);   \
-_RS_RUNTIME float4 __attribute__((overloadable)) \
+#define FN_FUNC_FN_FN_FN(fnc)                       \
+_RS_RUNTIME float2 __attribute__((overloadable))    \
+        fnc(float2 v1, float2 v2, float2 v3);       \
+_RS_RUNTIME float3 __attribute__((overloadable))    \
+        fnc(float3 v1, float3 v2, float3 v3);       \
+_RS_RUNTIME float4 __attribute__((overloadable))    \
         fnc(float4 v1, float4 v2, float4 v3);
 
-#define FN_FUNC_FN_FN_PIN(fnc)                  \
-_RS_RUNTIME float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 v2, int2 *v3);    \
-_RS_RUNTIME float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 v2, int3 *v3);    \
-_RS_RUNTIME float4 __attribute__((overloadable)) \
+#define FN_FUNC_FN_FN_F(fnc)                        \
+_RS_RUNTIME float2 __attribute__((overloadable))    \
+        fnc(float2 v1, float2 v2, float v3);        \
+_RS_RUNTIME float3 __attribute__((overloadable))    \
+        fnc(float3 v1, float3 v2, float v3);        \
+_RS_RUNTIME float4 __attribute__((overloadable))    \
+        fnc(float4 v1, float4 v2, float v3);
+
+#define FN_FUNC_FN_F_F(fnc)                         \
+_RS_RUNTIME float2 __attribute__((overloadable))    \
+        fnc(float2 v1, float v2, float v3);         \
+_RS_RUNTIME float3 __attribute__((overloadable))    \
+        fnc(float3 v1, float v2, float v3);         \
+_RS_RUNTIME float4 __attribute__((overloadable))    \
+        fnc(float4 v1, float v2, float v3);
+
+#define FN_FUNC_FN_FN_PIN(fnc)                      \
+_RS_RUNTIME float2 __attribute__((overloadable))    \
+        fnc(float2 v1, float2 v2, int2 *v3);        \
+_RS_RUNTIME float3 __attribute__((overloadable))    \
+        fnc(float3 v1, float3 v2, int3 *v3);        \
+_RS_RUNTIME float4 __attribute__((overloadable))    \
         fnc(float4 v1, float4 v2, int4 *v3);
 
 
@@ -491,7 +558,7 @@
 FN_FUNC_FN_FN_FN(mad)
 
 /**
- * Return the integral and fractional components of a number
+ * Return the integral and fractional components of a number.
  * Supports 1,2,3,4 components
  *
  * @param x Source value
@@ -575,7 +642,6 @@
 /**
  * Return (1 / sqrt(value)).
  *
- * @param v The incoming value in radians
  * Supports 1,2,3,4 components
  */
 _RS_RUNTIME float __attribute__((overloadable)) rsqrt(float v);
@@ -663,8 +729,8 @@
 
 #define XN_FUNC_YN(typeout, fnc, typein)                                \
 extern typeout __attribute__((overloadable)) fnc(typein);               \
-_RS_RUNTIME typeout##2 __attribute__((overloadable)) fnc(typein##2 v);   \
-_RS_RUNTIME typeout##3 __attribute__((overloadable)) fnc(typein##3 v);   \
+_RS_RUNTIME typeout##2 __attribute__((overloadable)) fnc(typein##2 v);  \
+_RS_RUNTIME typeout##3 __attribute__((overloadable)) fnc(typein##3 v);  \
 _RS_RUNTIME typeout##4 __attribute__((overloadable)) fnc(typein##4 v);
 
 #define UIN_FUNC_IN(fnc)          \
@@ -682,25 +748,36 @@
 
 
 #define XN_FUNC_XN_XN_BODY(type, fnc, body)         \
-_RS_RUNTIME type __attribute__((overloadable))       \
+_RS_RUNTIME type __attribute__((overloadable))      \
         fnc(type v1, type v2);                      \
-_RS_RUNTIME type##2 __attribute__((overloadable))    \
+_RS_RUNTIME type##2 __attribute__((overloadable))   \
         fnc(type##2 v1, type##2 v2);                \
-_RS_RUNTIME type##3 __attribute__((overloadable))    \
+_RS_RUNTIME type##3 __attribute__((overloadable))   \
         fnc(type##3 v1, type##3 v2);                \
-_RS_RUNTIME type##4 __attribute__((overloadable))    \
+_RS_RUNTIME type##4 __attribute__((overloadable))   \
         fnc(type##4 v1, type##4 v2);
 
-#define IN_FUNC_IN_IN_BODY(fnc, body) \
-XN_FUNC_XN_XN_BODY(uchar, fnc, body)  \
-XN_FUNC_XN_XN_BODY(char, fnc, body)   \
-XN_FUNC_XN_XN_BODY(ushort, fnc, body) \
-XN_FUNC_XN_XN_BODY(short, fnc, body)  \
-XN_FUNC_XN_XN_BODY(uint, fnc, body)   \
-XN_FUNC_XN_XN_BODY(int, fnc, body)    \
+#define IN_FUNC_IN_IN_BODY(fnc, body)   \
+XN_FUNC_XN_XN_BODY(uchar, fnc, body)    \
+XN_FUNC_XN_XN_BODY(char, fnc, body)     \
+XN_FUNC_XN_XN_BODY(ushort, fnc, body)   \
+XN_FUNC_XN_XN_BODY(short, fnc, body)    \
+XN_FUNC_XN_XN_BODY(uint, fnc, body)     \
+XN_FUNC_XN_XN_BODY(int, fnc, body)      \
 XN_FUNC_XN_XN_BODY(float, fnc, body)
 
+/**
+ * Return the absolute value of a value.
+ *
+ * Supports 1,2,3,4 components of char, short, int.
+ */
 UIN_FUNC_IN(abs)
+
+/**
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supports 1,2,3,4 components of uchar, char, ushort, short, uint, int.
+ */
 IN_FUNC_IN(clz)
 
 /**
@@ -727,12 +804,8 @@
  * @param high High bound, must match type of low
  */
 _RS_RUNTIME float __attribute__((overloadable)) clamp(float amount, float low, float high);
-_RS_RUNTIME float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high);
-_RS_RUNTIME float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high);
-_RS_RUNTIME float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
-_RS_RUNTIME float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high);
-_RS_RUNTIME float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high);
-_RS_RUNTIME float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high);
+FN_FUNC_FN_FN_FN(clamp)
+FN_FUNC_FN_F_F(clamp)
 
 /**
  * Convert from radians to degrees.
@@ -748,12 +821,8 @@
  * Supports 1,2,3,4 components
  */
 _RS_RUNTIME float __attribute__((overloadable)) mix(float start, float stop, float amount);
-_RS_RUNTIME float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount);
-_RS_RUNTIME float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount);
-_RS_RUNTIME float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount);
-_RS_RUNTIME float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount);
-_RS_RUNTIME float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount);
-_RS_RUNTIME float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount);
+FN_FUNC_FN_FN_FN(mix)
+FN_FUNC_FN_FN_F(mix)
 
 /**
  * Convert from degrees to radians.
@@ -772,12 +841,8 @@
  * Supports 1,2,3,4 components
  */
 _RS_RUNTIME float __attribute__((overloadable)) step(float edge, float v);
-_RS_RUNTIME float2 __attribute__((overloadable)) step(float2 edge, float2 v);
-_RS_RUNTIME float3 __attribute__((overloadable)) step(float3 edge, float3 v);
-_RS_RUNTIME float4 __attribute__((overloadable)) step(float4 edge, float4 v);
-_RS_RUNTIME float2 __attribute__((overloadable)) step(float2 edge, float v);
-_RS_RUNTIME float3 __attribute__((overloadable)) step(float3 edge, float v);
-_RS_RUNTIME float4 __attribute__((overloadable)) step(float4 edge, float v);
+FN_FUNC_FN_FN(step)
+FN_FUNC_FN_F(step)
 
 // not implemented
 extern float __attribute__((overloadable)) smoothstep(float, float, float);
@@ -789,6 +854,8 @@
 extern float4 __attribute__((overloadable)) smoothstep(float, float, float4);
 
 /**
+ * Return the sign of a value.
+ *
  * if (v < 0) return -1.f;
  * else if (v > 0) return 1.f;
  * else return 0.f;
@@ -812,9 +879,7 @@
  * Supports 1,2,3,4 components
  */
 _RS_RUNTIME float __attribute__((overloadable)) dot(float lhs, float rhs);
-_RS_RUNTIME float __attribute__((overloadable)) dot(float2 lhs, float2 rhs);
-_RS_RUNTIME float __attribute__((overloadable)) dot(float3 lhs, float3 rhs);
-_RS_RUNTIME float __attribute__((overloadable)) dot(float4 lhs, float4 rhs);
+F_FUNC_FN_FN(dot)
 
 /**
  * Compute the length of a vector.
@@ -822,9 +887,7 @@
  * Supports 1,2,3,4 components
  */
 _RS_RUNTIME float __attribute__((overloadable)) length(float v);
-_RS_RUNTIME float __attribute__((overloadable)) length(float2 v);
-_RS_RUNTIME float __attribute__((overloadable)) length(float3 v);
-_RS_RUNTIME float __attribute__((overloadable)) length(float4 v);
+F_FUNC_FN(length)
 
 /**
  * Compute the distance between two points.
@@ -832,9 +895,7 @@
  * Supports 1,2,3,4 components
  */
 _RS_RUNTIME float __attribute__((overloadable)) distance(float lhs, float rhs);
-_RS_RUNTIME float __attribute__((overloadable)) distance(float2 lhs, float2 rhs);
-_RS_RUNTIME float __attribute__((overloadable)) distance(float3 lhs, float3 rhs);
-_RS_RUNTIME float __attribute__((overloadable)) distance(float4 lhs, float4 rhs);
+F_FUNC_FN_FN(distance)
 
 /**
  * Normalize a vector.
@@ -842,21 +903,86 @@
  * Supports 1,2,3,4 components
  */
 _RS_RUNTIME float __attribute__((overloadable)) normalize(float v);
-_RS_RUNTIME float2 __attribute__((overloadable)) normalize(float2 v);
-_RS_RUNTIME float3 __attribute__((overloadable)) normalize(float3 v);
-_RS_RUNTIME float4 __attribute__((overloadable)) normalize(float4 v);
+FN_FUNC_FN(normalize)
+
+
+// New approx API functions
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+
+/**
+ * Return the approximate reciprocal (1 / v) of a value.
+ *
+ * Supports 1,2,3,4 components
+ */
+_RS_RUNTIME float __attribute__((overloadable)) approx_recip(float v);
+FN_FUNC_FN(approx_recip)
+
+/**
+ * Return the approximate square root of a value.
+ *
+ * Supports 1,2,3,4 components
+ */
+_RS_RUNTIME float __attribute__((overloadable)) approx_sqrt(float v);
+FN_FUNC_FN(approx_sqrt)
+
+/**
+ * Return the approximate value of (1 / sqrt(value)).
+ *
+ * Supports 1,2,3,4 components
+ */
+_RS_RUNTIME float __attribute__((overloadable)) approx_rsqrt(float v);
+FN_FUNC_FN(approx_rsqrt)
+
+/**
+ * Compute the approximate length of a vector.
+ *
+ * Supports 1,2,3,4 components
+ */
+_RS_RUNTIME float __attribute__((overloadable)) approx_length(float v);
+F_FUNC_FN(approx_length)
+
+/**
+ * Compute the approximate distance between two points.
+ *
+ * Supports 1,2,3,4 components
+ */
+_RS_RUNTIME float __attribute__((overloadable)) approx_distance(float lhs, float rhs);
+F_FUNC_FN_FN(approx_distance)
+
+/**
+ * Approximately normalize a vector; the result has the same type as v.
+ *
+ * Supports 1,2,3,4 components
+ */
+_RS_RUNTIME float __attribute__((overloadable)) approx_normalize(float v);
+FN_FUNC_FN(approx_normalize)
+
+/**
+ * Compute the approximate arctangent of a value.
+ *
+ * Supports 1,2,3,4 components
+ */
+_RS_RUNTIME float __attribute__((overloadable)) approx_atan(float v);
+FN_FUNC_FN(approx_atan)
+
+#endif  // (defined(RS_VERSION) && (RS_VERSION >= 17))
+
 
 #undef CVT_FUNC
 #undef CVT_FUNC_2
 #undef FN_FUNC_FN
+#undef F_FUNC_FN
 #undef IN_FUNC_FN
 #undef FN_FUNC_FN_FN
+#undef F_FUNC_FN_FN
 #undef FN_FUNC_FN_F
 #undef FN_FUNC_FN_IN
 #undef FN_FUNC_FN_I
 #undef FN_FUNC_FN_PFN
 #undef FN_FUNC_FN_PIN
 #undef FN_FUNC_FN_FN_FN
+#undef FN_FUNC_FN_FN_F
+#undef FN_FUNC_FN_F_F
 #undef FN_FUNC_FN_FN_PIN
 #undef XN_FUNC_YN
 #undef UIN_FUNC_IN
diff --git a/scriptc/rs_debug.rsh b/scriptc/rs_debug.rsh
index 074c28f..7a13c9d 100644
--- a/scriptc/rs_debug.rsh
+++ b/scriptc/rs_debug.rsh
@@ -27,7 +27,6 @@
 #define __RS_DEBUG_RSH__
 
 
-
 /**
  * Debug function.  Prints a string and value to the log.
  */
@@ -52,6 +51,21 @@
  * Debug function.  Prints a string and value to the log.
  */
 extern void __attribute__((overloadable))
+    rsDebug(const char *, float2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
     rsDebug(const char *, double);
 /**
  * Debug function.  Prints a string and value to the log.
@@ -103,21 +117,151 @@
  */
 extern void __attribute__((overloadable))
     rsDebug(const char *, const void *);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned char);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned short);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong4);
+#endif  // (defined(RS_VERSION) && (RS_VERSION >= 17))
+
 #define RS_DEBUG(a) rsDebug(#a, a)
 #define RS_DEBUG_MARKER rsDebug(__FILE__, __LINE__)
 
-
-/**
- * Debug function.  Prints a string and value to the log.
- */
-_RS_RUNTIME void __attribute__((overloadable)) rsDebug(const char *s, float2 v);
-/**
- * Debug function.  Prints a string and value to the log.
- */
-_RS_RUNTIME void __attribute__((overloadable)) rsDebug(const char *s, float3 v);
-/**
- * Debug function.  Prints a string and value to the log.
- */
-_RS_RUNTIME void __attribute__((overloadable)) rsDebug(const char *s, float4 v);
-
 #endif
diff --git a/scriptc/rs_math.rsh b/scriptc/rs_math.rsh
index 8117ca8..73040b3 100644
--- a/scriptc/rs_math.rsh
+++ b/scriptc/rs_math.rsh
@@ -244,5 +244,8 @@
  */
 _RS_RUNTIME float4 rsUnpackColor8888(uchar4 c);
 
+_RS_RUNTIME uchar4 __attribute__((overloadable)) rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
+_RS_RUNTIME float4 __attribute__((overloadable)) rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
+
 
 #endif