Generalize bitmap support and add remaining GL formats.
Fix bug in command fifo looping case.
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 7b8bc80..3a01a75 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -47,6 +47,7 @@
 
 Allocation::~Allocation()
 {
+    LOGE("Allocation %p destroyed", this);  // logs this object's address each time an Allocation is destructed
 }
 
 void Allocation::setCpuWritable(bool)
@@ -77,6 +78,13 @@
 
     //LOGE("uploadToTexture  %i,  lod %i", mTextureID, lodOffset);
 
+    GLenum type = mType->getElement()->getGLType();
+    GLenum format = mType->getElement()->getGLFormat();
+
+    if (!type || !format) {
+        return;
+    }
+
     if (!mTextureID) {
         glGenTextures(1, &mTextureID);
     }
@@ -87,9 +95,9 @@
         adapt.setLOD(lod+lodOffset);
 
         uint16_t * ptr = static_cast<uint16_t *>(adapt.getElement(0,0));
-        glTexImage2D(GL_TEXTURE_2D, lod, GL_RGB, 
-                     adapt.getDimX(), adapt.getDimY(), 
-                     0, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, ptr);
+        glTexImage2D(GL_TEXTURE_2D, lod, format,
+                     adapt.getDimX(), adapt.getDimY(),
+                     0, format, type, ptr);
     }
 }
 
@@ -121,7 +129,7 @@
     memcpy(ptr, data, count * eSize);
 }
 
-void Allocation::subData(uint32_t xoff, uint32_t yoff, 
+void Allocation::subData(uint32_t xoff, uint32_t yoff,
              uint32_t w, uint32_t h, const void *data)
 {
     uint32_t eSize = mType->getElementSizeBytes();
@@ -147,7 +155,7 @@
 
 
 /////////////////
-// 
+//
 
 
 namespace android {
@@ -192,7 +200,7 @@
 {
 }
 
-static void mip(const Adapter2D &out, const Adapter2D &in)
+static void mip565(const Adapter2D &out, const Adapter2D &in)
 {
     uint32_t w = out.getDimX();
     uint32_t h = out.getDimY();
@@ -203,7 +211,26 @@
         const uint16_t *i2 = static_cast<uint16_t *>(in.getElement(0, y*2+1));
 
         for (uint32_t x=0; x < h; x++) {
-            *oPtr = rsBoxFilter565(i1[0], i1[2], i2[0], i2[1]);
+            *oPtr = rsBoxFilter565(i1[0], i1[1], i2[0], i2[1]);
+            oPtr ++;
+            i1 += 2;
+            i2 += 2;
+        }
+    }
+}
+
+static void mip8888(const Adapter2D &out, const Adapter2D &in)
+{
+    uint32_t w = out.getDimX();
+    uint32_t h = out.getDimY();
+
+    for (uint32_t y=0; y < w; y++) {
+        uint32_t *oPtr = static_cast<uint32_t *>(out.getElement(0, y));
+        const uint32_t *i1 = static_cast<uint32_t *>(in.getElement(0, y*2));
+        const uint32_t *i2 = static_cast<uint32_t *>(in.getElement(0, y*2+1));
+
+        for (uint32_t x=0; x < h; x++) {
+            *oPtr = rsBoxFilter8888(i1[0], i1[1], i2[0], i2[1]);
             oPtr ++;
             i1 += 2;
             i2 += 2;
@@ -255,21 +282,25 @@
 
 static ElementConverter_t pickConverter(RsElementPredefined dstFmt, RsElementPredefined srcFmt)
 {
-    if ((dstFmt == RS_ELEMENT_RGB_565) && 
+    if ((dstFmt == RS_ELEMENT_RGB_565) &&
         (srcFmt == RS_ELEMENT_RGB_565)) {
         return elementConverter_cpy_16;
     }
 
-    if ((dstFmt == RS_ELEMENT_RGB_565) && 
+    if ((dstFmt == RS_ELEMENT_RGB_565) &&
         (srcFmt == RS_ELEMENT_RGB_888)) {
         return elementConverter_888_to_565;
     }
 
-    if ((dstFmt == RS_ELEMENT_RGB_565) && 
+    if ((dstFmt == RS_ELEMENT_RGB_565) &&
         (srcFmt == RS_ELEMENT_RGBA_8888)) {
         return elementConverter_8888_to_565;
     }
 
+    if ((dstFmt == RS_ELEMENT_RGBA_8888) &&
+        (srcFmt == RS_ELEMENT_RGBA_8888)) {
+        return elementConverter_cpy_32;
+    }
 
     LOGE("pickConverter, unsuported combo");
     return 0;
@@ -303,7 +334,7 @@
         for(uint32_t lod=0; lod < (texAlloc->getType()->getLODCount() -1); lod++) {
             adapt.setLOD(lod);
             adapt2.setLOD(lod + 1);
-            mip(adapt2, adapt);
+            mip565(adapt2, adapt);
         }
     }
 
@@ -312,6 +343,8 @@
 
 RsAllocation rsi_AllocationCreateFromFile(Context *rsc, const char *file, bool genMips)
 {
+    bool use32bpp = false;
+
     typedef struct _Win3xBitmapHeader
     {
        uint16_t type;
@@ -351,7 +384,11 @@
     int32_t texWidth = rsHigherPow2(hdr.width);
     int32_t texHeight = rsHigherPow2(hdr.height);
 
-    rsi_TypeBegin(rsc, rsi_ElementGetPredefined(rsc, RS_ELEMENT_RGB_565));
+    if (use32bpp) {
+        rsi_TypeBegin(rsc, rsi_ElementGetPredefined(rsc, RS_ELEMENT_RGBA_8888));
+    } else {
+        rsi_TypeBegin(rsc, rsi_ElementGetPredefined(rsc, RS_ELEMENT_RGB_565));
+    }
     rsi_TypeAdd(rsc, RS_DIMENSION_X, texWidth);
     rsi_TypeAdd(rsc, RS_DIMENSION_Y, texHeight);
     if (genMips) {
@@ -372,14 +409,29 @@
     Adapter2D adapt(texAlloc);
     uint8_t * fileInBuf = new uint8_t[texWidth * 3];
     uint32_t yOffset = (hdr.width - hdr.height) / 2;
-    uint16_t *tmp = static_cast<uint16_t *>(adapt.getElement(0, yOffset));
 
-    for (int y=0; y < hdr.height; y++) {
-        fseek(f, hdr.offset + (y*hdr.width*3), SEEK_SET);
-        fread(fileInBuf, 1, hdr.width * 3, f);
-        for(int x=0; x < hdr.width; x++) {
-            *tmp = rs888to565(fileInBuf[x*3], fileInBuf[x*3 + 1], fileInBuf[x*3 + 2]);
-            tmp++;
+    if (use32bpp) {
+        uint8_t *tmp = static_cast<uint8_t *>(adapt.getElement(0, yOffset));
+        for (int y=0; y < hdr.height; y++) {
+            fseek(f, hdr.offset + (y*hdr.width*3), SEEK_SET);
+            fread(fileInBuf, 1, hdr.width * 3, f);
+            for(int x=0; x < hdr.width; x++) {
+                tmp[0] = fileInBuf[x*3 + 2];
+                tmp[1] = fileInBuf[x*3 + 1];
+                tmp[2] = fileInBuf[x*3];
+                tmp[3] = 0xff;
+                tmp += 4;
+            }
+        }
+    } else {
+        uint16_t *tmp = static_cast<uint16_t *>(adapt.getElement(0, yOffset));
+        for (int y=0; y < hdr.height; y++) {
+            fseek(f, hdr.offset + (y*hdr.width*3), SEEK_SET);
+            fread(fileInBuf, 1, hdr.width * 3, f);
+            for(int x=0; x < hdr.width; x++) {
+                *tmp = rs888to565(fileInBuf[x*3 + 2], fileInBuf[x*3 + 1], fileInBuf[x*3]);
+                tmp++;
+            }
         }
     }
 
@@ -391,7 +443,11 @@
         for(uint32_t lod=0; lod < (texAlloc->getType()->getLODCount() -1); lod++) {
             adapt.setLOD(lod);
             adapt2.setLOD(lod + 1);
-            mip(adapt2, adapt);
+            if (use32bpp) {
+                mip8888(adapt2, adapt);
+            } else {
+                mip565(adapt2, adapt);
+            }
         }
     }
 
diff --git a/rsElement.cpp b/rsElement.cpp
index bd11f72..5a44f47 100644
--- a/rsElement.cpp
+++ b/rsElement.cpp
@@ -16,6 +16,8 @@
 
 #include "rsContext.h"
 
+#include <GLES/gl.h>
+
 using namespace android;
 using namespace android::renderscript;
 
@@ -235,6 +237,108 @@
     return offset;
 }
 
+uint32_t Element::getGLType() const  // GL data-type enum matching this element's per-component bit widths; 0 if none matches
+{
+    int bits[4];  // bit width per component; only the first mComponentCount entries are filled
+
+    if (mComponentCount > 4) {  // bits[] has room for four entries only
+        return 0;
+    }
+
+    for (uint32_t ct=0; ct < mComponentCount; ct++) {
+        bits[ct] = mComponents[ct]->getBits();
+        if (mComponents[ct]->getType() != Component::UNSIGNED) {  // any non-UNSIGNED component rejects the whole element
+            return 0;
+        }
+        if (!mComponents[ct]->getIsNormalized()) {  // every component must also report getIsNormalized() as true
+            return 0;
+        }
+    }
+
+    switch(mComponentCount) {  // select by component count, then by the exact bit widths in order
+    case 1:  // one 8-bit component
+        if (bits[0] == 8) {
+            return GL_UNSIGNED_BYTE;
+        }
+        return 0;
+    case 2:  // two 8-bit components
+        if ((bits[0] == 8) &&
+            (bits[1] == 8)) {
+            return GL_UNSIGNED_BYTE;
+        }
+        return 0;
+    case 3:  // 8/8/8, or 5/6/5
+        if ((bits[0] == 8) &&
+            (bits[1] == 8) &&
+            (bits[2] == 8)) {
+            return GL_UNSIGNED_BYTE;
+        }
+        if ((bits[0] == 5) &&
+            (bits[1] == 6) &&
+            (bits[2] == 5)) {
+            return GL_UNSIGNED_SHORT_5_6_5;
+        }
+        return 0;
+    case 4:  // 8/8/8/8, 4/4/4/4, or 5/5/5/1; no match falls out of the switch
+        if ((bits[0] == 8) &&
+            (bits[1] == 8) &&
+            (bits[2] == 8) &&
+            (bits[3] == 8)) {
+            return GL_UNSIGNED_BYTE;
+        }
+        if ((bits[0] == 4) &&
+            (bits[1] == 4) &&
+            (bits[2] == 4) &&
+            (bits[3] == 4)) {
+            return GL_UNSIGNED_SHORT_4_4_4_4;
+        }
+        if ((bits[0] == 5) &&
+            (bits[1] == 5) &&
+            (bits[2] == 5) &&
+            (bits[3] == 1)) {
+            return GL_UNSIGNED_SHORT_5_5_5_1;
+        }
+    }
+    return 0;  // reached for a component count of 0 or an unmatched 4-component layout
+}
+
+uint32_t Element::getGLFormat() const  // GL pixel-format enum chosen from the component kinds; 0 if none matches
+{
+    switch(mComponentCount) {  // kinds are compared position by position, so component order matters
+    case 1:  // a lone ALPHA or a lone LUMINANCE component
+        if (mComponents[0]->getKind() == Component::ALPHA) {
+            return GL_ALPHA;
+        }
+        if (mComponents[0]->getKind() == Component::LUMINANCE) {
+            return GL_LUMINANCE;
+        }
+        break;
+    case 2:  // LUMINANCE followed by ALPHA
+        if ((mComponents[0]->getKind() == Component::LUMINANCE) &&
+            (mComponents[1]->getKind() == Component::ALPHA)) {
+            return GL_LUMINANCE_ALPHA;
+        }
+        break;
+    case 3:  // RED, GREEN, BLUE in that order
+        if ((mComponents[0]->getKind() == Component::RED) &&
+            (mComponents[1]->getKind() == Component::GREEN) &&
+            (mComponents[2]->getKind() == Component::BLUE)) {
+            return GL_RGB;
+        }
+        break;
+    case 4:  // RED, GREEN, BLUE, ALPHA in that order
+        if ((mComponents[0]->getKind() == Component::RED) &&
+            (mComponents[1]->getKind() == Component::GREEN) &&
+            (mComponents[2]->getKind() == Component::BLUE) &&
+            (mComponents[3]->getKind() == Component::ALPHA)) {
+            return GL_RGBA;
+        }
+        break;
+    }
+    return 0;  // reached via the breaks above or for a component count with no case
+}
+
+
 ElementState::ElementState()
 {
 }
diff --git a/rsElement.h b/rsElement.h
index 7852ffc..2434977 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -36,6 +36,8 @@
 
     void setComponent(uint32_t idx, Component *c);
 
+    uint32_t getGLType() const;
+    uint32_t getGLFormat() const;
 
 
     size_t getSizeBits() const;
diff --git a/rsLocklessFifo.cpp b/rsLocklessFifo.cpp
index 3f51e04..67ab434 100644
--- a/rsLocklessFifo.cpp
+++ b/rsLocklessFifo.cpp
@@ -74,6 +74,7 @@
         freeSpace = 0;
     }
     
+    //LOGE("free %i", freeSpace);
     return freeSpace;
 }
 
@@ -85,8 +86,8 @@
 
 void * LocklessCommandFifo::reserve(uint32_t sizeInBytes)
 {
-    // Add space for command header;
-    sizeInBytes += 4;
+    // Add space for command header and loop token;
+    sizeInBytes += 8;
 
     //dumpState("reserve");
     if (getFreeSpace() < sizeInBytes) {
@@ -153,16 +154,17 @@
 
 void LocklessCommandFifo::makeSpace(uint32_t bytes)
 {
+    //dumpState("make space");
     if ((mPut+bytes) > mEnd) {
         // Need to loop regardless of where get is.
-        while((mGet > mPut) && (mPut+4 >= mGet)) {
+        while((mGet > mPut) && (mBuffer+4 >= mGet)) {
             sleep(1);
         }
 
         // Toss in a reset then the normal wait for space will do the rest.
         reinterpret_cast<uint16_t *>(mPut)[0] = 0;
         reinterpret_cast<uint16_t *>(mPut)[1] = 0;
-        mPut += 4;
+        mPut = mBuffer;
     }
 
     // it will fit here so we just need to wait for space.
diff --git a/rsUtils.h b/rsUtils.h
index f40e2ce..5a43fb3 100644
--- a/rsUtils.h
+++ b/rsUtils.h
@@ -96,13 +96,19 @@
 static inline uint16_t rsBoxFilter565(uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4)
 {
     uint32_t r = ((i1 & 0x1f) + (i2 & 0x1f) + (i3 & 0x1f) + (i4 & 0x1f));
-    uint32_t g = ((i1 >> 5) & 0x3f) + ((i2 >> 5) & 0x3f) + ((i3 >> 5) & 0x3f) + ((i1 >> 5) & 0x3f);
+    uint32_t g = ((i1 >> 5) & 0x3f) + ((i2 >> 5) & 0x3f) + ((i3 >> 5) & 0x3f) + ((i4 >> 5) & 0x3f);  // 6-bit field at bits 5-10, summed over all four inputs
     uint32_t b = ((i1 >> 11) + (i2 >> 11) + (i3 >> 11) + (i4 >> 11));
     return (r >> 2) | ((g >> 2) << 5) | ((b >> 2) << 11);
 }
 
-
-
+static inline uint32_t rsBoxFilter8888(uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4)  // averages four 32-bit values one byte lane at a time
+{
+    uint32_t r = (i1 & 0xff) +         (i2 & 0xff) +         (i3 & 0xff) +         (i4 & 0xff);  // sum of byte 0 (bits 0-7)
+    uint32_t g = ((i1 >> 8) & 0xff) +  ((i2 >> 8) & 0xff) +  ((i3 >> 8) & 0xff) +  ((i4 >> 8) & 0xff);  // sum of byte 1 (bits 8-15)
+    uint32_t b = ((i1 >> 16) & 0xff) + ((i2 >> 16) & 0xff) + ((i3 >> 16) & 0xff) + ((i4 >> 16) & 0xff);  // sum of byte 2 (bits 16-23)
+    uint32_t a = ((i1 >> 24) & 0xff) + ((i2 >> 24) & 0xff) + ((i3 >> 24) & 0xff) + ((i4 >> 24) & 0xff);  // sum of byte 3 (bits 24-31)
+    return (r >> 2) | ((g >> 2) << 8) | ((b >> 2) << 16) | ((a >> 2) << 24);  // each sum divided by four (truncating) and repacked into its lane
+}