Expand RS vector3 types to vector4.

BUG=5609007

The underlying LLVM implementation for vector3 types does this implicitly. If
RS does not adjust its implementation, we will always be misaligned for any
subsequent data after a vector3 type. We previously inserted padding into the
reflected layers from llvm-rs-cc (hence the skip padding part of this change).
We can safely ignore the padding now that the Java/native code is updated to
use the expanded size. The compiler will also need modification to ensure that
we don't mistakenly skip over any end-of-struct padding.

This fixes the 3-component vector padding problem.

Change-Id: If68af42287deb8f4b28addcd19a9fa314656be44
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 35d812d..c1192fe 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -195,6 +195,81 @@
          prefix, getPtr(), mHal.state.usageFlags, mHal.state.mipmapControl);
 }
 
+uint32_t Allocation::getPackedSize() const {  // byte count of the data with each item at its unpadded size
+    uint32_t numItems = mHal.state.type->getSizeBytes() / mHal.state.type->getElementSizeBytes();  // total bytes / bytes per item
+    return numItems * mHal.state.type->getElement()->getSizeBytesUnpadded();
+}
+
+void Allocation::writePackedData(const Type *type,
+                                 uint8_t *dst, const uint8_t *src, bool dstPadded) {
+    // Copies every item of 'type' from src to dst while converting between the
+    // padded and unpadded layouts: dstPadded == true reads unpadded items and
+    // writes padded ones, dstPadded == false does the opposite.
+    const Element *e = type->getElement();
+    const uint32_t packedItemBytes = e->getSizeBytesUnpadded();
+    const uint32_t paddedItemBytes = e->getSizeBytes();
+    const uint32_t itemCount = type->getSizeBytes() / paddedItemBytes;
+    const uint32_t srcStride = dstPadded ? packedItemBytes : paddedItemBytes;
+    const uint32_t dstStride = dstPadded ? paddedItemBytes : packedItemBytes;
+    const uint32_t numFields = e->getFieldCount();
+    if (numFields == 0) {
+        // Simple element: one copy of the unpadded byte count per item.
+        for (uint32_t item = 0; item < itemCount; ++item) {
+            memcpy(dst, src, packedItemBytes);
+            src += srcStride;
+            dst += dstStride;
+        }
+        return;
+    }
+
+    // Struct element: fetch each field's offsets (both layouts) and unpadded size once.
+    // NOTE(review): the size comes from getField(f) alone; a field's arraySize
+    // is not consulted here -- confirm array fields are handled as intended.
+    uint32_t *paddedOffsets = new uint32_t[numFields];
+    uint32_t *packedOffsets = new uint32_t[numFields];
+    uint32_t *fieldBytes = new uint32_t[numFields];
+    for (uint32_t f = 0; f < numFields; ++f) {
+        paddedOffsets[f] = e->getFieldOffsetBytes(f);
+        packedOffsets[f] = e->getFieldOffsetBytesUnpadded(f);
+        fieldBytes[f] = e->getField(f)->getSizeBytesUnpadded();
+    }
+    const uint32_t *srcOffsets = dstPadded ? packedOffsets : paddedOffsets;
+    const uint32_t *dstOffsets = dstPadded ? paddedOffsets : packedOffsets;
+
+    // Copy field by field so each one lands at its destination-layout offset.
+    for (uint32_t item = 0; item < itemCount; ++item) {
+        for (uint32_t f = 0; f < numFields; ++f) {
+            memcpy(dst + dstOffsets[f], src + srcOffsets[f], fieldBytes[f]);
+        }
+        src += srcStride;
+        dst += dstStride;
+    }
+    delete[] paddedOffsets;
+    delete[] packedOffsets;
+    delete[] fieldBytes;
+}
+
+void Allocation::unpackVec3Allocation(const void *data, uint32_t dataSize) {
+    // Expand the unpadded bytes in 'data' into this allocation's padded storage.
+    // NOTE(review): dataSize is never read, so 'data' is not bounds-checked here.
+    uint8_t *padded = (uint8_t*)getPtr();
+    writePackedData(getType(), padded, (const uint8_t*)data, true);
+}
+
+void Allocation::packVec3Allocation(OStream *stream) const {
+    // Writes the allocation to 'stream' with padding stripped: the padded storage
+    // is repacked into a temporary buffer, which is then appended to the stream.
+    const Element *elem = getType()->getElement();
+    const uint32_t paddedItemBytes = elem->getSizeBytes();
+    const uint32_t packedItemBytes = elem->getSizeBytesUnpadded();
+    const uint32_t itemCount = mHal.state.type->getSizeBytes() / paddedItemBytes;
+    uint8_t *scratch = new uint8_t[itemCount * packedItemBytes];
+
+    writePackedData(getType(), scratch, (const uint8_t*)getPtr(), false);
+    stream->addByteArray(scratch, getPackedSize());
+    delete[] scratch;
+}
+
 void Allocation::serialize(OStream *stream) const {
     // Need to identify ourselves
     stream->addU32((uint32_t)getClassId());
@@ -207,10 +282,17 @@
     mHal.state.type->serialize(stream);
 
     uint32_t dataSize = mHal.state.type->getSizeBytes();
+    // 3 element vectors are padded to 4 in memory, but padding isn't serialized
+    uint32_t packedSize = getPackedSize();
     // Write how much data we are storing
-    stream->addU32(dataSize);
-    // Now write the data
-    stream->addByteArray(getPtr(), dataSize);
+    stream->addU32(packedSize);
+    if (dataSize == packedSize) {
+        // Now write the data
+        stream->addByteArray(getPtr(), dataSize);
+    } else {
+        // Now write the data
+        packVec3Allocation(stream);
+    }
 }
 
 Allocation *Allocation::createFromStream(Context *rsc, IStream *stream) {
@@ -230,22 +312,30 @@
     }
     type->compute();
 
+    Allocation *alloc = Allocation::createAllocation(rsc, type, RS_ALLOCATION_USAGE_SCRIPT);
+    type->decUserRef();
+
     // Number of bytes we wrote out for this allocation
     uint32_t dataSize = stream->loadU32();
-    if (dataSize != type->getSizeBytes()) {
+    // 3 element vectors are padded to 4 in memory, but padding isn't serialized
+    uint32_t packedSize = alloc->getPackedSize();
+    if (dataSize != type->getSizeBytes() &&
+        dataSize != packedSize) {
         LOGE("failed to read allocation because numbytes written is not the same loaded type wants\n");
+        ObjectBase::checkDelete(alloc);
         ObjectBase::checkDelete(type);
         return NULL;
     }
 
-    Allocation *alloc = Allocation::createAllocation(rsc, type, RS_ALLOCATION_USAGE_SCRIPT);
     alloc->setName(name.string(), name.size());
-    type->decUserRef();
 
-    uint32_t count = dataSize / type->getElementSizeBytes();
-
-    // Read in all of our allocation data
-    alloc->data(rsc, 0, 0, count, stream->getPtr() + stream->getPos(), dataSize);
+    if (dataSize == type->getSizeBytes()) {
+        uint32_t count = dataSize / type->getElementSizeBytes();
+        // Read in all of our allocation data
+        alloc->data(rsc, 0, 0, count, stream->getPtr() + stream->getPos(), dataSize);
+    } else {
+        alloc->unpackVec3Allocation(stream->getPtr() + stream->getPos(), dataSize);
+    }
     stream->reset(stream->getPos() + dataSize);
 
     return alloc;
diff --git a/rsAllocation.h b/rsAllocation.h
index 714798a..4ce863a 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -135,6 +135,11 @@
 private:
     void freeChildrenUnlocked();
     Allocation(Context *rsc, const Type *, uint32_t usages, RsAllocationMipmapControl mc);
+
+    uint32_t getPackedSize() const;
+    static void writePackedData(const Type *type, uint8_t *dst, const uint8_t *src, bool dstPadded);
+    void unpackVec3Allocation(const void *data, uint32_t dataSize);
+    void packVec3Allocation(OStream *stream) const;
 };
 
 }
diff --git a/rsComponent.cpp b/rsComponent.cpp
index 7d9cf0b..21b98f6 100644
--- a/rsComponent.cpp
+++ b/rsComponent.cpp
@@ -169,7 +169,8 @@
         break;
     }
 
-    mBits = mTypeBits * mVectorSize;
+    mBitsUnpadded = mTypeBits * mVectorSize;
+    mBits = mTypeBits * rsHigherPow2(mVectorSize);
 }
 
 bool Component::isReference() const {
diff --git a/rsComponent.h b/rsComponent.h
index 6ddc990..8629d0d 100644
--- a/rsComponent.h
+++ b/rsComponent.h
@@ -41,6 +41,7 @@
     bool getIsFloat() const {return mIsFloat;}
     bool getIsSigned() const {return mIsSigned;}
     uint32_t getBits() const {return mBits;}
+    uint32_t getBitsUnpadded() const {return mBitsUnpadded;}  // mTypeBits * mVectorSize, vector size not rounded up
 
     // Helpers for reading / writing this class out
     void serialize(OStream *stream) const;
@@ -56,6 +57,7 @@
 
     // derived
     uint32_t mBits;
+    uint32_t mBitsUnpadded;
     uint32_t mTypeBits;
     bool mIsFloat;
     bool mIsSigned;
diff --git a/rsElement.cpp b/rsElement.cpp
index df90ce4..56c31b6 100644
--- a/rsElement.cpp
+++ b/rsElement.cpp
@@ -23,6 +23,7 @@
 
 Element::Element(Context *rsc) : ObjectBase(rsc) {
     mBits = 0;
+    mBitsUnpadded = 0;
     mFields = NULL;
     mFieldCount = 0;
     mHasReference = false;
@@ -60,6 +61,18 @@
     return total;
 }
 
+size_t Element::getSizeBitsUnpadded() const {
+    // Unpadded width in bits: own value when there are no fields, else the per-field sum.
+    if (mFieldCount == 0) {
+        return mBitsUnpadded;
+    }
+    size_t sum = 0;
+    for (size_t i = 0; i < mFieldCount; ++i) {
+        sum += mFields[i].e->mBitsUnpadded * mFields[i].arraySize;
+    }
+    return sum;
+}
+
 void Element::dumpLOGV(const char *prefix) const {
     ObjectBase::dumpLOGV(prefix);
     ALOGV("%s Element: fieldCount: %zu,  size bytes: %zu", prefix, mFieldCount, getSizeBytes());
@@ -146,14 +159,18 @@
 void Element::compute() {
     if (mFieldCount == 0) {
         mBits = mComponent.getBits();
+        mBitsUnpadded = mComponent.getBitsUnpadded();
         mHasReference = mComponent.isReference();
         return;
     }
 
     size_t bits = 0;
+    size_t bitsUnpadded = 0;
     for (size_t ct=0; ct < mFieldCount; ct++) {
         mFields[ct].offsetBits = bits;
+        mFields[ct].offsetBitsUnpadded = bitsUnpadded;
         bits += mFields[ct].e->getSizeBits() * mFields[ct].arraySize;
+        bitsUnpadded += mFields[ct].e->getSizeBitsUnpadded() * mFields[ct].arraySize;
 
         if (mFields[ct].e->mHasReference) {
             mHasReference = true;
diff --git a/rsElement.h b/rsElement.h
index bfdec53..04010fa 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -43,6 +43,11 @@
     uint32_t getGLType() const;
     uint32_t getGLFormat() const;
 
+    size_t getSizeBitsUnpadded() const;
+    size_t getSizeBytesUnpadded() const {  // unpadded size, rounded up to whole bytes
+        return (getSizeBitsUnpadded() + 7) >> 3;
+    }
+
     size_t getSizeBits() const;
     size_t getSizeBytes() const {
         return (getSizeBits() + 7) >> 3;
@@ -55,6 +60,10 @@
         return mFields[componentNumber].offsetBits >> 3;
     }
 
+    size_t getFieldOffsetBytesUnpadded(uint32_t componentNumber) const {  // byte offset of a field in the unpadded layout
+        return mFields[componentNumber].offsetBitsUnpadded >> 3;  // bits -> bytes; componentNumber is not range-checked
+    }
+
     uint32_t getFieldCount() const {return mFieldCount;}
     const Element * getField(uint32_t idx) const {return mFields[idx].e.get();}
     const char * getFieldName(uint32_t idx) const {return mFields[idx].name.string();}
@@ -64,6 +73,7 @@
     RsDataType getType() const {return mComponent.getType();}
     RsDataKind getKind() const {return mComponent.getKind();}
     uint32_t getBits() const {return mBits;}
+    uint32_t getBitsUnpadded() const {return mBitsUnpadded;}  // unpadded counterpart of getBits(); returns the raw member
 
     void dumpLOGV(const char *prefix) const;
     virtual void serialize(OStream *stream) const;
@@ -112,6 +122,7 @@
         String8 name;
         ObjectBaseRef<const Element> e;
         uint32_t offsetBits;
+        uint32_t offsetBitsUnpadded;
         uint32_t arraySize;
     } ElementField_t;
     ElementField_t *mFields;
@@ -123,6 +134,7 @@
     Element(Context *);
 
     Component mComponent;
+    uint32_t mBitsUnpadded;
     uint32_t mBits;
 
     void compute();
diff --git a/rsFont.cpp b/rsFont.cpp
index 7efed9d..d1b7324 100644
--- a/rsFont.cpp
+++ b/rsFont.cpp
@@ -651,7 +651,7 @@
                                float x4, float y4, float z4,
                                float u4, float v4) {
     const uint32_t vertsPerQuad = 4;
-    const uint32_t floatsPerVert = 5;
+    const uint32_t floatsPerVert = 6;
     float *currentPos = mTextMeshPtr + mCurrentQuadIndex * vertsPerQuad * floatsPerVert;
 
     // Cull things that are off the screen
@@ -670,24 +670,28 @@
     (*currentPos++) = x1;
     (*currentPos++) = y1;
     (*currentPos++) = z1;
+    (*currentPos++) = 0;
     (*currentPos++) = u1;
     (*currentPos++) = v1;
 
     (*currentPos++) = x2;
     (*currentPos++) = y2;
     (*currentPos++) = z2;
+    (*currentPos++) = 0;
     (*currentPos++) = u2;
     (*currentPos++) = v2;
 
     (*currentPos++) = x3;
     (*currentPos++) = y3;
     (*currentPos++) = z3;
+    (*currentPos++) = 0;
     (*currentPos++) = u3;
     (*currentPos++) = v3;
 
     (*currentPos++) = x4;
     (*currentPos++) = y4;
     (*currentPos++) = z4;
+    (*currentPos++) = 0;
     (*currentPos++) = u4;
     (*currentPos++) = v4;