Clean up type offsets that cannot be calculated for flexible YUV.

Support flexible YUV

bug 10567550

Change-Id: I4f6e5a8d86eeee635605460f1751208f3320969b
(cherry picked from commit a75372759e288be3fb8835735a830b1f7d1a4c42)
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 4f56443..7546b38 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -107,6 +107,7 @@
 };
 
 extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
 extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, uint32_t count, const short *param);
 
 void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
@@ -135,91 +136,68 @@
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    switch (cp->alloc->mHal.state.yuv) {
-    // In API 17 there was no yuv format and the intrinsic treated everything as NV21
-    case 0:
-#if !defined(RS_SERVER)
-    case HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-#endif
-        {
-            const uchar *pinUV = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
-            size_t strideUV = cp->alloc->mHal.drvState.lod[1].stride;
-            const uchar *uv = pinUV + ((p->y >> 1) * strideUV);
+    // Byte distance between consecutive chroma samples (1 = planar, 2 = interleaved).
+    // Not const: the legacy fallback below has to override it.
+    size_t cstep = cp->alloc->mHal.drvState.yuv.step;
 
-            if (pinUV == NULL) {
-                // Legacy yuv support didn't fill in uv
-                strideUV = strideY;
-                uv = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
-                    (strideY * p->dimY) +
-                    ((p->y >> 1) * strideUV);
-            }
+    const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
+    const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
+    const uchar *u = pinU + ((p->y >> 1) * strideU);
 
-            if(x2 > x1) {
-                if (gArchUseSIMD) {
-            #if defined(ARCH_ARM_HAVE_VFP)
-                    int32_t len = (x2 - x1 - 1) >> 3;
-                    if(len > 0) {
-                        //                    ALOGE("%p, %p, %p, %d, %p", out, Y, uv, len, YuvCoeff);
-                        rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
-                        x1 += len << 3;
-                        out += len << 3;
-                    }
-            #endif
+    const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
+    const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
+    const uchar *v = pinV + ((p->y >> 1) * strideV);
+
+    if (pinU == NULL) {
+        // Legacy yuv support didn't fill in uv
+        v = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
+            (strideY * p->dimY) +
+            ((p->y >> 1) * strideY);
+        u = v + 1;
+        // Legacy buffers are treated as NV21 (interleaved V,U), so chroma samples
+        // are two bytes apart regardless of what the driver state reports.
+        cstep = 2;
+    }
+
+#if defined(ARCH_ARM_HAVE_VFP)
+    if((x2 > x1) && gArchUseSIMD) {
+        int32_t len = (x2 - x1 - 1) >> 3;
+        if(len > 0) {
+            if (cstep == 1) {
+                rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
+                x1 += len << 3;
+                out += len << 3;
+            } else if (cstep == 2) {
+                // Check for proper interleave
+                intptr_t ipu = (intptr_t)u;
+                intptr_t ipv = (intptr_t)v;
+
+                if (ipu == (ipv + 1)) {
+                    rsdIntrinsicYuv_K(out, Y, v, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
+                } else if (ipu == (ipv - 1)) {
+                    rsdIntrinsicYuvR_K(out, Y, u, len, YuvCoeff);
+                    x1 += len << 3;
+                    out += len << 3;
                 }
 
-               // ALOGE("y %i  %i  %i", p->y, x1, x2);
-                while(x1 < x2) {
-                    uchar u = uv[(x1 & 0xffffe) + 1];
-                    uchar v = uv[(x1 & 0xffffe) + 0];
-                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-                    out++;
-                    x1++;
-                    *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-                    out++;
-                    x1++;
-                }
             }
         }
-        break;
-
-#if !defined(RS_SERVER)
-    case HAL_PIXEL_FORMAT_YV12:
-        {
-            const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
-            const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
-            const uchar *u = pinU + ((p->y >> 1) * strideU);
-
-            const uchar *pinV = (const uchar *)cp->alloc->mHal.drvState.lod[2].mallocPtr;
-            const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
-            const uchar *v = pinV + ((p->y >> 1) * strideV);
-
-            if(x2 > x1) {
-        #if defined(ARCH_ARM_HAVE_VFP)
-                if (gArchUseSIMD) {
-                    int32_t len = (x2 - x1 - 1) >> 3;
-                    if(len > 0) {
-                        rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
-                        x1 += len << 3;
-                        out += len << 3;
-                    }
-                }
-        #endif
-
-               // ALOGE("y %i  %i  %i", p->y, x1, x2);
-                while(x1 < x2) {
-                    uchar ut = u[x1];
-                    uchar vt = v[x1];
-                    *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
-                    out++;
-                    x1++;
-                    *out = rsYuvToRGBA_uchar4(Y[x1], ut, vt);
-                    out++;
-                    x1++;
-                }
-            }
-        }
-        break;
+    }
 #endif
+
+    if(x2 > x1) {
+       // ALOGE("y %i  %i  %i", p->y, x1, x2);
+        while(x1 < x2) {
+            int cx = (x1 >> 1) * cstep;
+            *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
+            out++;
+            x1++;
+            *out = rsYuvToRGBA_uchar4(Y[x1], u[cx], v[cx]);
+            out++;
+            x1++;
+        }
     }
 
 }
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 52fd565..da58f89 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -338,6 +338,97 @@
 END(rsdIntrinsicYuv_K)
 
 /*
+    Function called with the following arguments: dst, Y, uv, len, YuvCoeff
+        r0 = dst
+        r1 = Y
+        r2 = UV (interleaved chroma, U byte first)
+        r3 = length (pixels / 8)
+        ---- Args below will be in the stack ----
+        sp = YuvCoeff
+
+        This function converts 8 pixels to RGBA per iteration
+*/
+ENTRY(rsdIntrinsicYuvR_K)
+        push        {r4, r5, lr}            @ preserve clobbered int registers
+        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
+
+        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
+
+        ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
+        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
+        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
+        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
+
+        mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
+
+        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
+                                            @ the coeffs matrix (Q2)
+
+        1:
+        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
+        vld2.8      {d12, d14}, [r2], r4    @ split U (d12) from V (d14) (r2 -> UV) and increase pointer by 8 (in r4)
+        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
+        pld         [r2, #64]               @ preloading data from address uv(r2) + 64 for subsequent loops
+
+        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
+        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
+        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
+
+        vsubl.u8    Q5, d14, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
+        vsubl.u8    Q6, d12, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
+        vmov.u16    d11, d10                @ Copying V to d11
+        vmov.u16    d13, d12                @ Copying U to d13
+        vzip.u16    d10, d11                @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3)
+        vzip.u16    d12, d13                @ Q6 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3)
+
+
+        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
+        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
+        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
+        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
+
+                                            @                  R    G    B
+                                            @     Pixel(0-3)  Q8,  Q9, Q10
+                                            @     Pixel(4-7) Q11, Q12, Q13
+                                            @
+
+                                            @ Pixel(0-3)
+        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
+        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
+        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
+        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
+
+                                            @ Pixel(4-7)
+        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
+        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
+        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
+        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
+
+                                            @ Pixel(0-3)
+        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
+
+                                            @ Pixel(4-7)
+        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
+        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
+
+        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
+        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
+
+        subs        r3, r3, #1              @ Checking length (r3)
+        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
+
+        bne 1b                              @ if not done with length, loop
+
+        vpop        {Q4-Q7}                 @ Restore Vregisters
+        pop         {r4, r5, lr}            @ Restore int registers
+        bx          lr
+END(rsdIntrinsicYuvR_K)
+
+/*
     Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
         r0 = dst
         r1 = Y
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 66eddeb..9f217e8 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -243,8 +243,9 @@
         return;
     }
     RSD_CALL_GL(glBindBuffer, drv->glTarget, drv->bufferID);
-    RSD_CALL_GL(glBufferData, drv->glTarget, alloc->mHal.state.type->getSizeBytes(),
-                 alloc->mHal.drvState.lod[0].mallocPtr, GL_DYNAMIC_DRAW);
+    RSD_CALL_GL(glBufferData, drv->glTarget,
+                alloc->mHal.state.type->getPackedSizeBytes(),
+                alloc->mHal.drvState.lod[0].mallocPtr, GL_DYNAMIC_DRAW);
     RSD_CALL_GL(glBindBuffer, drv->glTarget, 0);
     rsdGLCheckError(rsc, "UploadToBufferObject");
 #endif
@@ -255,34 +256,42 @@
     // YUV only supports basic 2d
     // so we can stash the plane pointers in the mipmap levels.
     size_t uvSize = 0;
+    state->lod[1].dimX = state->lod[0].dimX / 2;
+    state->lod[1].dimY = state->lod[0].dimY / 2;
+    state->lod[2].dimX = state->lod[0].dimX / 2;
+    state->lod[2].dimY = state->lod[0].dimY / 2;
+    state->yuv.shift = 1;
+    state->yuv.step = 1;
+    state->lodCount = 3;
+
 #ifndef RS_SERVER
     switch(yuv) {
     case HAL_PIXEL_FORMAT_YV12:
-        state->lod[2].dimX = state->lod[0].dimX / 2;
-        state->lod[2].dimY = state->lod[0].dimY / 2;
         state->lod[2].stride = rsRound(state->lod[0].stride >> 1, 16);
         state->lod[2].mallocPtr = ((uint8_t *)state->lod[0].mallocPtr) +
                 (state->lod[0].stride * state->lod[0].dimY);
         uvSize += state->lod[2].stride * state->lod[2].dimY;
 
-        state->lod[1].dimX = state->lod[2].dimX;
-        state->lod[1].dimY = state->lod[2].dimY;
         state->lod[1].stride = state->lod[2].stride;
         state->lod[1].mallocPtr = ((uint8_t *)state->lod[2].mallocPtr) +
                 (state->lod[2].stride * state->lod[2].dimY);
         uvSize += state->lod[1].stride * state->lod[2].dimY;
-
-        state->lodCount = 3;
         break;
     case HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-        state->lod[1].dimX = state->lod[0].dimX;
-        state->lod[1].dimY = state->lod[0].dimY / 2;
+        //state->lod[1].dimX = state->lod[0].dimX;
         state->lod[1].stride = state->lod[0].stride;
-        state->lod[1].mallocPtr = ((uint8_t *)state->lod[0].mallocPtr) +
+        state->lod[2].stride = state->lod[0].stride;
+        state->lod[2].mallocPtr = ((uint8_t *)state->lod[0].mallocPtr) +
                 (state->lod[0].stride * state->lod[0].dimY);
+        state->lod[1].mallocPtr = ((uint8_t *)state->lod[2].mallocPtr) + 1;
         uvSize += state->lod[1].stride * state->lod[1].dimY;
-        state->lodCount = 2;
+        state->yuv.step = 2;
         break;
+#ifndef RS_COMPATIBILITY_LIB
+    case HAL_PIXEL_FORMAT_YCbCr_420_888:
+        // This will be filled in by ioReceive()
+        break;
+#endif
     default:
         rsAssert(0);
     }
@@ -766,7 +775,6 @@
     const size_t eSize = alloc->mHal.state.type->getElementSizeBytes();
     uint8_t * ptr = GetOffsetPtr(alloc, xoff, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
     size_t size = count * eSize;
-
     if (ptr != data) {
         // Skip the copy if we are the same allocation. This can arise from
         // our Bitmap optimization, where we share the same storage.
@@ -811,13 +819,20 @@
         }
         if (alloc->mHal.state.yuv) {
             int lod = 1;
-            while (alloc->mHal.drvState.lod[lod].mallocPtr) {
-                size_t lineSize = alloc->mHal.drvState.lod[lod].dimX;
+            int maxLod = 2;
+            if (alloc->mHal.state.yuv == HAL_PIXEL_FORMAT_YV12) {
+                maxLod = 3;
+            } else if (alloc->mHal.state.yuv == HAL_PIXEL_FORMAT_YCrCb_420_SP) {
+                lod = 2;
+                maxLod = 3;
+            }
+
+            while (lod < maxLod) {
                 uint8_t *dst = GetOffsetPtr(alloc, xoff, yoff, 0, lod, face);
 
                 for (uint32_t line=(yoff >> 1); line < ((yoff+h)>>1); line++) {
                     memcpy(dst, src, lineSize);
-                    src += lineSize;
+                    src += alloc->mHal.drvState.lod[lod].stride;
                     dst += alloc->mHal.drvState.lod[lod].stride;
                 }
                 lod++;
diff --git a/driver/runtime/rs_allocation.c b/driver/runtime/rs_allocation.c
index 198e9af..964853b 100644
--- a/driver/runtime/rs_allocation.c
+++ b/driver/runtime/rs_allocation.c
@@ -266,53 +266,27 @@
         rsGetElementAtYuv_uchar_U(rs_allocation a, uint32_t x, uint32_t y) {
 
     Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint32_t yuvID = alloc->mHal.state.yuv;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[1].mallocPtr;
-    const uint32_t stride = alloc->mHal.drvState.lod[1].stride;
 
-    switch(yuvID) {
-    case 0x32315659: //HAL_PIXEL_FORMAT_YV12:
-        x >>= 1;
-        y >>= 1;
-        return p[x + (y * stride)];
-    case 11: //HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-        x >>= 1;
-        y >>= 1;
-        return p[(x<<1) + (y * stride)];
-    default:
-        break;
-    }
+    const size_t cstep = alloc->mHal.drvState.yuv.step;
+    const size_t shift = alloc->mHal.drvState.yuv.shift;
+    const size_t stride = alloc->mHal.drvState.lod[2].stride;
 
-    return 0;
+    const uchar *pin = (const uchar *)alloc->mHal.drvState.lod[2].mallocPtr;
+
+    return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
 }
 
 extern const uchar __attribute__((overloadable))
         rsGetElementAtYuv_uchar_V(rs_allocation a, uint32_t x, uint32_t y) {
 
     Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint32_t yuvID = alloc->mHal.state.yuv;
 
-    switch(yuvID) {
-    case 0x32315659: //HAL_PIXEL_FORMAT_YV12:
-        {
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[2].mallocPtr;
-        const uint32_t stride = alloc->mHal.drvState.lod[2].stride;
-        x >>= 1;
-        y >>= 1;
-        return p[x + (y * stride)];
-        }
-    case 11: //HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-        {
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[1].mallocPtr;
-        const uint32_t stride = alloc->mHal.drvState.lod[1].stride;
-        x >>= 1;
-        y >>= 1;
-        return p[(x<<1) + (y * stride) + 1];
-        }
-    default:
-            break;
-    }
+    const size_t cstep = alloc->mHal.drvState.yuv.step;
+    const size_t shift = alloc->mHal.drvState.yuv.shift;
+    const size_t stride = alloc->mHal.drvState.lod[1].stride;
 
-    return 0;
+    const uchar *pin = (const uchar *)alloc->mHal.drvState.lod[1].mallocPtr;
+
+    return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
 }
 
diff --git a/driver/runtime/rs_structs.h b/driver/runtime/rs_structs.h
index 204717c..26a41ee 100644
--- a/driver/runtime/rs_structs.h
+++ b/driver/runtime/rs_structs.h
@@ -54,6 +54,11 @@
             size_t faceOffset;
             uint32_t lodCount;
             uint32_t faceCount;
+
+            struct YuvState {
+                uint32_t shift;
+                uint32_t step;
+            } yuv;
         } drvState;
     } mHal;
 } Allocation_t;
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 9bf8709..f6f4ac9 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -256,7 +256,7 @@
 }
 
 uint32_t Allocation::getPackedSize() const {
-    uint32_t numItems = mHal.state.type->getSizeBytes() / mHal.state.type->getElementSizeBytes();
+    uint32_t numItems = mHal.state.type->getCellCount();
     return numItems * mHal.state.type->getElement()->getSizeBytesUnpadded();
 }
 
@@ -265,7 +265,7 @@
     const Element *elem = type->getElement();
     uint32_t unpaddedBytes = elem->getSizeBytesUnpadded();
     uint32_t paddedBytes = elem->getSizeBytes();
-    uint32_t numItems = type->getSizeBytes() / paddedBytes;
+    uint32_t numItems = type->getPackedSizeBytes() / paddedBytes;
 
     uint32_t srcInc = !dstPadded ? paddedBytes : unpaddedBytes;
     uint32_t dstInc =  dstPadded ? paddedBytes : unpaddedBytes;
@@ -320,7 +320,7 @@
 void Allocation::packVec3Allocation(Context *rsc, OStream *stream) const {
     uint32_t paddedBytes = getType()->getElement()->getSizeBytes();
     uint32_t unpaddedBytes = getType()->getElement()->getSizeBytesUnpadded();
-    uint32_t numItems = mHal.state.type->getSizeBytes() / paddedBytes;
+    uint32_t numItems = mHal.state.type->getCellCount();
 
     const uint8_t *src = (const uint8_t*)rsc->mHal.funcs.allocation.lock1D(rsc, this);
     uint8_t *dst = new uint8_t[numItems * unpaddedBytes];
@@ -341,7 +341,7 @@
     // to initialize the class
     mHal.state.type->serialize(rsc, stream);
 
-    uint32_t dataSize = mHal.state.type->getSizeBytes();
+    uint32_t dataSize = mHal.state.type->getPackedSizeBytes();
     // 3 element vectors are padded to 4 in memory, but padding isn't serialized
     uint32_t packedSize = getPackedSize();
     // Write how much data we are storing
@@ -379,7 +379,7 @@
     uint32_t dataSize = stream->loadU32();
     // 3 element vectors are padded to 4 in memory, but padding isn't serialized
     uint32_t packedSize = alloc->getPackedSize();
-    if (dataSize != type->getSizeBytes() &&
+    if (dataSize != type->getPackedSizeBytes() &&
         dataSize != packedSize) {
         ALOGE("failed to read allocation because numbytes written is not the same loaded type wants\n");
         ObjectBase::checkDelete(alloc);
@@ -388,7 +388,7 @@
     }
 
     alloc->assignName(name);
-    if (dataSize == type->getSizeBytes()) {
+    if (dataSize == type->getPackedSizeBytes()) {
         uint32_t count = dataSize / type->getElementSizeBytes();
         // Read in all of our allocation data
         alloc->data(rsc, 0, 0, count, stream->getPtr() + stream->getPos(), dataSize);
@@ -422,7 +422,7 @@
 
 void Allocation::freeChildrenUnlocked () {
     void *ptr = mRSC->mHal.funcs.allocation.lock1D(mRSC, this);
-    decRefs(ptr, mHal.state.type->getSizeBytes() / mHal.state.type->getElementSizeBytes(), 0);
+    decRefs(ptr, mHal.state.type->getCellCount(), 0);
     mRSC->mHal.funcs.allocation.unlock1D(mRSC, this);
 }
 
diff --git a/rsAllocation.h b/rsAllocation.h
index f98fa83..b0f2f9e 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -83,6 +83,11 @@
             size_t faceOffset;
             uint32_t lodCount;
             uint32_t faceCount;
+
+            struct YuvState {
+                uint32_t shift;
+                uint32_t step;
+            } yuv;
         };
         mutable DrvState drvState;
 
diff --git a/rsGrallocConsumer.cpp b/rsGrallocConsumer.cpp
index e3bd9d4..c5d37b2 100644
--- a/rsGrallocConsumer.cpp
+++ b/rsGrallocConsumer.cpp
@@ -41,7 +41,9 @@
     if (y < 1) y = 1;
     mConsumer->setDefaultBufferSize(a->mHal.drvState.lod[0].dimX, y);
 
-    //mBufferQueue->setDefaultBufferFormat(defaultFormat);
+    if (a->mHal.state.yuv) {
+        bq->setDefaultBufferFormat(a->mHal.state.yuv);
+    }
     //mBufferQueue->setConsumerName(name);
 }
 
@@ -146,6 +148,9 @@
         mAlloc->mHal.drvState.lod[0].stride = ycbcr.ystride;
         mAlloc->mHal.drvState.lod[1].stride = ycbcr.cstride;
         mAlloc->mHal.drvState.lod[2].stride = ycbcr.cstride;
+
+        mAlloc->mHal.drvState.yuv.shift = 1;
+        mAlloc->mHal.drvState.yuv.step = ycbcr.chroma_step;
     }
 
     return OK;
diff --git a/rsScriptC_LibGL.cpp b/rsScriptC_LibGL.cpp
index 279ddb2..dbf2336 100644
--- a/rsScriptC_LibGL.cpp
+++ b/rsScriptC_LibGL.cpp
@@ -220,7 +220,7 @@
 
 void rsrDrawTextAlloc(Context *rsc, Allocation *a, int x, int y) {
     const char *text = (const char *)rsc->mHal.funcs.allocation.lock1D(rsc, a);
-    size_t allocSize = a->getType()->getSizeBytes();
+    size_t allocSize = a->getType()->getPackedSizeBytes();
     rsc->mStateFont.renderText(text, allocSize, x, y);
     rsc->mHal.funcs.allocation.unlock1D(rsc, a);
 }
@@ -250,7 +250,7 @@
                          int32_t *left, int32_t *right, int32_t *top, int32_t *bottom) {
     CHECK_OBJ(a);
     const char *text = (const char *)rsc->mHal.funcs.allocation.lock1D(rsc, a);
-    size_t textLen = a->getType()->getSizeBytes();
+    size_t textLen = a->getType()->getPackedSizeBytes();
     Font::Rect metrics;
     rsc->mStateFont.measureText(text, textLen, &metrics);
     SetMetrics(&metrics, left, right, top, bottom);
diff --git a/rsType.cpp b/rsType.cpp
index 33e2ca7..48a18cd 100644
--- a/rsType.cpp
+++ b/rsType.cpp
@@ -50,7 +50,6 @@
         delete [] mHal.state.lodDimX;
         delete [] mHal.state.lodDimY;
         delete [] mHal.state.lodDimZ;
-        delete [] mHal.state.lodOffset;
     }
     mElement.clear();
     memset(&mHal, 0, sizeof(mHal));
@@ -63,11 +62,6 @@
     rsAssert(!mTypes.size());
 }
 
-size_t Type::getOffsetForFace(uint32_t face) const {
-    rsAssert(mHal.state.faces);
-    return 0;
-}
-
 void Type::compute() {
     uint32_t oldLODCount = mHal.state.lodCount;
     if (mDimLOD) {
@@ -85,97 +79,58 @@
             delete [] mHal.state.lodDimX;
             delete [] mHal.state.lodDimY;
             delete [] mHal.state.lodDimZ;
-            delete [] mHal.state.lodOffset;
         }
         mHal.state.lodDimX = new uint32_t[mHal.state.lodCount];
         mHal.state.lodDimY = new uint32_t[mHal.state.lodCount];
         mHal.state.lodDimZ = new uint32_t[mHal.state.lodCount];
-        mHal.state.lodOffset = new uint32_t[mHal.state.lodCount];
     }
 
     uint32_t tx = mHal.state.dimX;
     uint32_t ty = mHal.state.dimY;
     uint32_t tz = mHal.state.dimZ;
-    size_t offset = 0;
+    mCellCount = 0;
     for (uint32_t lod=0; lod < mHal.state.lodCount; lod++) {
         mHal.state.lodDimX[lod] = tx;
         mHal.state.lodDimY[lod] = ty;
         mHal.state.lodDimZ[lod]  = tz;
-        mHal.state.lodOffset[lod] = offset;
-        offset += tx * rsMax(ty, 1u) * rsMax(tz, 1u) * mElement->getSizeBytes();
+        mCellCount += tx * rsMax(ty, 1u) * rsMax(tz, 1u);
         if (tx > 1) tx >>= 1;
         if (ty > 1) ty >>= 1;
         if (tz > 1) tz >>= 1;
     }
 
-    // At this point the offset is the size of a mipmap chain;
-    mMipChainSizeBytes = offset;
-
     if (mHal.state.faces) {
-        offset *= 6;
+        mCellCount *= 6;
     }
 #ifndef RS_SERVER
     // YUV only supports basic 2d
     // so we can stash the plane pointers in the mipmap levels.
     if (mHal.state.dimYuv) {
+        mHal.state.lodDimX[1] = mHal.state.lodDimX[0] / 2;
+        mHal.state.lodDimY[1] = mHal.state.lodDimY[0] / 2;
+        mHal.state.lodDimX[2] = mHal.state.lodDimX[0] / 2;
+        mHal.state.lodDimY[2] = mHal.state.lodDimY[0] / 2;
+        mCellCount += mHal.state.lodDimX[1] * mHal.state.lodDimY[1];
+        mCellCount += mHal.state.lodDimX[2] * mHal.state.lodDimY[2];
+
         switch(mHal.state.dimYuv) {
         case HAL_PIXEL_FORMAT_YV12:
-            mHal.state.lodOffset[1] = offset;
-            mHal.state.lodDimX[1] = mHal.state.lodDimX[0] / 2;
-            mHal.state.lodDimY[1] = mHal.state.lodDimY[0] / 2;
-            offset += offset / 4;
-            mHal.state.lodOffset[2] = offset;
-            mHal.state.lodDimX[2] = mHal.state.lodDimX[0] / 2;
-            mHal.state.lodDimY[2] = mHal.state.lodDimY[0] / 2;
-            offset += offset / 4;
             break;
         case HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-            mHal.state.lodOffset[1] = offset;
             mHal.state.lodDimX[1] = mHal.state.lodDimX[0];
-            mHal.state.lodDimY[1] = mHal.state.lodDimY[0] / 2;
-            offset += offset / 2;
             break;
+#ifndef RS_COMPATIBILITY_LIB
+        case HAL_PIXEL_FORMAT_YCbCr_420_888:
+            break;
+#endif
         default:
             rsAssert(0);
         }
     }
 #endif
-    mTotalSizeBytes = offset;
     mHal.state.element = mElement.get();
 }
 
-uint32_t Type::getLODOffset(uint32_t lod, uint32_t x) const {
-    uint32_t offset = mHal.state.lodOffset[lod];
-    offset += x * mElement->getSizeBytes();
-    return offset;
-}
-
-uint32_t Type::getLODOffset(uint32_t lod, uint32_t x, uint32_t y) const {
-    uint32_t offset = mHal.state.lodOffset[lod];
-    offset += (x + y * mHal.state.lodDimX[lod]) * mElement->getSizeBytes();
-    return offset;
-}
-
-uint32_t Type::getLODOffset(uint32_t lod, uint32_t x, uint32_t y, uint32_t z) const {
-    uint32_t offset = mHal.state.lodOffset[lod];
-    offset += (x +
-               y * mHal.state.lodDimX[lod] +
-               z * mHal.state.lodDimX[lod] * mHal.state.lodDimY[lod]) * mElement->getSizeBytes();
-    return offset;
-}
-
-uint32_t Type::getLODFaceOffset(uint32_t lod, RsAllocationCubemapFace face,
-                                uint32_t x, uint32_t y) const {
-    uint32_t offset = mHal.state.lodOffset[lod];
-    offset += (x + y * mHal.state.lodDimX[lod]) * mElement->getSizeBytes();
-
-    if (face != 0) {
-        uint32_t faceOffset = getSizeBytes() / 6;
-        offset += faceOffset * face;
-    }
-    return offset;
-}
-
 void Type::dumpLOGV(const char *prefix) const {
     char buf[1024];
     ObjectBase::dumpLOGV(prefix);
diff --git a/rsType.h b/rsType.h
index d2bc96b..26dacfc 100644
--- a/rsType.h
+++ b/rsType.h
@@ -47,7 +47,7 @@
             uint32_t *lodDimX;
             uint32_t *lodDimY;
             uint32_t *lodDimZ;
-            uint32_t *lodOffset;
+            uint32_t *_unused;
             uint32_t lodCount;
             uint32_t dimYuv;
             bool faces;
@@ -58,10 +58,9 @@
 
     Type * createTex2D(const Element *, size_t w, size_t h, bool mip);
 
-    size_t getOffsetForFace(uint32_t face) const;
-
-    size_t getSizeBytes() const {return mTotalSizeBytes;}
+    size_t getCellCount() const {return mCellCount;}
     size_t getElementSizeBytes() const {return mElement->getSizeBytes();}
+    size_t getPackedSizeBytes() const {return mCellCount * mElement->getSizeBytes();}
     const Element * getElement() const {return mElement.get();}
 
     uint32_t getDimX() const {return mHal.state.dimX;}
@@ -83,16 +82,6 @@
         rsAssert(lod < mHal.state.lodCount);
         return mHal.state.lodDimZ[lod];
     }
-    uint32_t getLODOffset(uint32_t lod) const {
-        rsAssert(lod < mHal.state.lodCount);
-        return mHal.state.lodOffset[lod];
-    }
-    uint32_t getLODOffset(uint32_t lod, uint32_t x) const;
-    uint32_t getLODOffset(uint32_t lod, uint32_t x, uint32_t y) const;
-    uint32_t getLODOffset(uint32_t lod, uint32_t x, uint32_t y, uint32_t z) const;
-
-    uint32_t getLODFaceOffset(uint32_t lod, RsAllocationCubemapFace face,
-                              uint32_t x, uint32_t y) const;
 
     uint32_t getLODCount() const {return mHal.state.lodCount;}
     bool getIsNp2() const;
@@ -137,8 +126,7 @@
 
     // count of mipmap levels, 0 indicates no mipmapping
 
-    size_t mMipChainSizeBytes;
-    size_t mTotalSizeBytes;
+    size_t mCellCount;
 protected:
     virtual void preDestroy() const;
     virtual ~Type();