Remove pad word from arrays

This change removes the 4-byte pad word from all arrays except arrays of
longs and doubles, which keep it so their elements stay 8-byte aligned.
It saves 76KB in the boot image and also reduces the size of arrays in
the heap (and thereby reduces garbage collection).

Change-Id: I3ff277d5bf14c57c0f7552215818e588ec6cc275
diff --git a/src/check_jni.cc b/src/check_jni.cc
index 84c7376..6dc47f7 100644
--- a/src/check_jni.cc
+++ b/src/check_jni.cc
@@ -1110,8 +1110,9 @@
   ScopedJniThreadState ts(env);
 
   Array* a = Decode<Array*>(ts, java_array);
-  size_t byte_count = a->GetLength() * a->GetClass()->GetComponentSize();
-  void* result = GuardedCopy::Create(a->GetRawData(), byte_count, true);
+  size_t component_size = a->GetClass()->GetComponentSize();
+  size_t byte_count = a->GetLength() * component_size;
+  void* result = GuardedCopy::Create(a->GetRawData(component_size), byte_count, true);
   if (isCopy != NULL) {
     *isCopy = JNI_TRUE;
   }
@@ -1134,7 +1135,7 @@
 
   if (mode != JNI_ABORT) {
     size_t len = GuardedCopy::FromData(dataBuf)->original_length;
-    memcpy(a->GetRawData(), dataBuf, len);
+    memcpy(a->GetRawData(a->GetClass()->GetComponentSize()), dataBuf, len);
   }
   if (mode != JNI_COMMIT) {
     GuardedCopy::Destroy(dataBuf);
diff --git a/src/class_linker_test.cc b/src/class_linker_test.cc
index 1f7f24d..8c97cf1 100644
--- a/src/class_linker_test.cc
+++ b/src/class_linker_test.cc
@@ -748,23 +748,48 @@
   AssertDexFile(java_lang_dex_file_, NULL);
 }
 
-// The first reference array element must be a multiple of 8 bytes from the
+// The first reference array element must be a multiple of 4 bytes from the
 // start of the object
 TEST_F(ClassLinkerTest, ValidateObjectArrayElementsOffset) {
   Class* array_class = class_linker_->FindSystemClass("[Ljava/lang/String;");
   ObjectArray<String>* array = ObjectArray<String>::Alloc(array_class, 0);
   uint32_t array_offset = reinterpret_cast<uint32_t>(array);
   uint32_t data_offset =
-      array_offset + ObjectArray<String>::DataOffset().Uint32Value();
-  EXPECT_EQ(16U, data_offset - array_offset);
+      array_offset + ObjectArray<String>::DataOffset(sizeof(String*)).Uint32Value();
+  if (sizeof(String*) == sizeof(int32_t)) {
+    EXPECT_TRUE(IsAligned<4>(data_offset));  // Check 4 byte alignment.
+  } else {
+    EXPECT_TRUE(IsAligned<8>(data_offset));  // Check 8 byte alignment.
+  }
 }
 
 TEST_F(ClassLinkerTest, ValidatePrimitiveArrayElementsOffset) {
-  SirtRef<LongArray> array(LongArray::Alloc(0));
-  EXPECT_EQ(class_linker_->FindSystemClass("[J"), array->GetClass());
-  uint32_t array_offset = reinterpret_cast<uint32_t>(array.get());
-  uint32_t data_offset = reinterpret_cast<uint32_t>(array->GetData());
-  EXPECT_EQ(16U, data_offset - array_offset);
+  SirtRef<LongArray> long_array(LongArray::Alloc(0));
+  EXPECT_EQ(class_linker_->FindSystemClass("[J"), long_array->GetClass());
+  uintptr_t data_offset = reinterpret_cast<uintptr_t>(long_array->GetData());
+  EXPECT_TRUE(IsAligned<8>(data_offset));  // Longs require 8 byte alignment
+
+  SirtRef<DoubleArray> double_array(DoubleArray::Alloc(0));
+  EXPECT_EQ(class_linker_->FindSystemClass("[D"), double_array->GetClass());
+  data_offset = reinterpret_cast<uintptr_t>(double_array->GetData());
+  EXPECT_TRUE(IsAligned<8>(data_offset));  // Doubles require 8 byte alignment
+
+  SirtRef<IntArray> int_array(IntArray::Alloc(0));
+  EXPECT_EQ(class_linker_->FindSystemClass("[I"), int_array->GetClass());
+  data_offset = reinterpret_cast<uintptr_t>(int_array->GetData());
+  EXPECT_TRUE(IsAligned<4>(data_offset));  // Ints require 4 byte alignment
+
+  SirtRef<CharArray> char_array(CharArray::Alloc(0));
+  EXPECT_EQ(class_linker_->FindSystemClass("[C"), char_array->GetClass());
+  data_offset = reinterpret_cast<uintptr_t>(char_array->GetData());
+  EXPECT_TRUE(IsAligned<2>(data_offset));  // Chars require 2 byte alignment
+
+  SirtRef<ShortArray> short_array(ShortArray::Alloc(0));
+  EXPECT_EQ(class_linker_->FindSystemClass("[S"), short_array->GetClass());
+  data_offset = reinterpret_cast<uintptr_t>(short_array->GetData());
+  EXPECT_TRUE(IsAligned<2>(data_offset));  // Shorts require 2 byte alignment
+
+  // Take it as given that bytes and booleans have byte alignment
 }
 
 TEST_F(ClassLinkerTest, ValidateBoxedTypes) {
diff --git a/src/compiler/codegen/arm/ArchFactory.cc b/src/compiler/codegen/arm/ArchFactory.cc
index 79a62ad..bdadf6e 100644
--- a/src/compiler/codegen/arm/ArchFactory.cc
+++ b/src/compiler/codegen/arm/ArchFactory.cc
@@ -229,6 +229,10 @@
      */
     oatLockTemp(cUnit, r0);
 
+    // TODO: use the correct component size, currently all supported types share array alignment
+    // with ints (see comment at head of function)
+    size_t component_size = sizeof(int32_t);
+
     // Having a range of 0 is legal
     if (isRange && (dInsn->vA > 0)) {
         /*
@@ -262,7 +266,7 @@
                     oatSRegOffset(cUnit, rlFirst.sRegLow));
         // Set up the target pointer
         opRegRegImm(cUnit, kOpAdd, rDst, r0,
-                    Array::DataOffset().Int32Value());
+                    Array::DataOffset(component_size).Int32Value());
         // Set up the loop counter (known to be > 0)
         loadConstant(cUnit, rIdx, dInsn->vA - 1);
         // Generate the copy loop.  Going backwards for convenience
@@ -281,7 +285,7 @@
             RegLocation rlArg = loadValue(cUnit,
                 oatGetSrc(cUnit, mir, i), kCoreReg);
             storeBaseDisp(cUnit, r0,
-                          Array::DataOffset().Int32Value() +
+                          Array::DataOffset(component_size).Int32Value() +
                           i * 4, rlArg.lowReg, kWord);
             // If the loadValue caused a temp to be allocated, free it
             if (oatIsTemp(cUnit, rlArg.lowReg)) {
@@ -330,7 +334,7 @@
                 Method::DexCacheInitializedStaticStorageOffset().Int32Value(),
                 rBase);
             loadWordDisp(cUnit, rBase,
-                         Array::DataOffset().Int32Value() + sizeof(int32_t*) *
+                         Array::DataOffset(sizeof(Object*)).Int32Value() + sizeof(int32_t*) *
                          ssbIndex, rBase);
             // rBase now points at appropriate static storage base (Class*)
             // or NULL if not initialized. Check for NULL and call helper if NULL.
@@ -424,7 +428,8 @@
                 Method::DexCacheInitializedStaticStorageOffset().Int32Value(),
                 rBase);
             loadWordDisp(cUnit, rBase,
-                         Array::DataOffset().Int32Value() + sizeof(int32_t*) * ssbIndex,
+                         Array::DataOffset(sizeof(Object*)).Int32Value() +
+                         sizeof(int32_t*) * ssbIndex,
                          rBase);
             // rBase now points at appropriate static storage base (Class*)
             // or NULL if not initialized. Check for NULL and call helper if NULL.
@@ -497,7 +502,8 @@
             break;
         case 2:  // Grab target method*
             loadWordDisp(cUnit, r0,
-                Array::DataOffset().Int32Value() + dexIdx * 4, r0);
+                Array::DataOffset(sizeof(Object*)).Int32Value() + dexIdx * 4,
+                r0);
             break;
         case 3:  // Grab the code from the method*
             loadWordDisp(cUnit, r0, Method::GetCodeOffset().Int32Value(), rLR);
@@ -538,7 +544,7 @@
             break;
         case 3: // Get target method [use rLR, set r0]
             loadWordDisp(cUnit, rLR, (methodIdx * 4) +
-                         Array::DataOffset().Int32Value(), r0);
+                         Array::DataOffset(sizeof(Object*)).Int32Value(), r0);
             break;
         case 4: // Get the target compiled code address [uses r0, sets rLR]
             loadWordDisp(cUnit, r0, Method::GetCodeOffset().Int32Value(), rLR);
@@ -584,7 +590,7 @@
             break;
         case 3: // Get target method [use rLR, set r0]
             loadWordDisp(cUnit, rLR, (methodIdx * 4) +
-                         Array::DataOffset().Int32Value(), r0);
+                         Array::DataOffset(sizeof(Object*)).Int32Value(), r0);
             break;
         case 4: // Get the target compiled code address [uses r0, sets rLR]
             loadWordDisp(cUnit, r0, Method::GetCodeOffset().Int32Value(), rLR);
diff --git a/src/compiler/codegen/arm/Thumb2/Gen.cc b/src/compiler/codegen/arm/Thumb2/Gen.cc
index c385f35..5a9750a 100644
--- a/src/compiler/codegen/arm/Thumb2/Gen.cc
+++ b/src/compiler/codegen/arm/Thumb2/Gen.cc
@@ -510,7 +510,8 @@
         // We're don't need access checks, load type from dex cache
         int32_t dex_cache_offset = Method::DexCacheResolvedTypesOffset().Int32Value();
         loadWordDisp(cUnit, mReg, dex_cache_offset, resReg);
-        int32_t offset_of_type = Array::DataOffset().Int32Value() + (sizeof(Class*) * type_idx);
+        int32_t offset_of_type = Array::DataOffset(sizeof(Class*)).Int32Value() +
+                                 (sizeof(Class*) * type_idx);
         loadWordDisp(cUnit, resReg, offset_of_type, rlResult.lowReg);
         if (!cUnit->compiler->CanAssumeTypeIsPresentInDexCache(cUnit->dex_cache,
                                                                type_idx) ||
@@ -571,7 +572,8 @@
 {
     /* NOTE: Most strings should be available at compile time */
     uint32_t string_idx = mir->dalvikInsn.vB;
-    int32_t offset_of_string = Array::DataOffset().Int32Value() + (sizeof(String*) * string_idx);
+    int32_t offset_of_string = Array::DataOffset(sizeof(String*)).Int32Value() +
+                               (sizeof(String*) * string_idx);
     if (!cUnit->compiler->CanAssumeStringIsPresentInDexCache(cUnit->dex_cache, string_idx) ||
         SLOW_STRING_PATH) {
         // slow path, resolve string if not in dex cache
@@ -664,7 +666,8 @@
         // Load dex cache entry into classReg (r2)
         loadValueDirectFixed(cUnit, rlSrc, r0);  // r0 <= ref
         loadWordDisp(cUnit, r1, Method::DexCacheResolvedTypesOffset().Int32Value(), classReg);
-        int32_t offset_of_type = Array::DataOffset().Int32Value() + (sizeof(Class*) * type_idx);
+        int32_t offset_of_type = Array::DataOffset(sizeof(Class*)).Int32Value() +
+                                 (sizeof(Class*) * type_idx);
         loadWordDisp(cUnit, classReg, offset_of_type, classReg);
         if (!cUnit->compiler->CanAssumeTypeIsPresentInDexCache(cUnit->dex_cache, type_idx)) {
             // Need to test presence of type in dex cache at runtime
@@ -728,7 +731,8 @@
     } else {
         // Load dex cache entry into classReg (r2)
         loadWordDisp(cUnit, r1, Method::DexCacheResolvedTypesOffset().Int32Value(), classReg);
-        int32_t offset_of_type = Array::DataOffset().Int32Value() + (sizeof(Class*) * type_idx);
+        int32_t offset_of_type = Array::DataOffset(sizeof(Class*)).Int32Value() +
+                                 (sizeof(Class*) * type_idx);
         loadWordDisp(cUnit, classReg, offset_of_type, classReg);
         if (!cUnit->compiler->CanAssumeTypeIsPresentInDexCache(cUnit->dex_cache, type_idx)) {
             // Need to test presence of type in dex cache at runtime
@@ -1253,7 +1257,7 @@
 {
     RegisterClass regClass = oatRegClassBySize(kWord);
     int lenOffset = Array::LengthOffset().Int32Value();
-    int dataOffset = Array::DataOffset().Int32Value();
+    int dataOffset = Array::DataOffset(sizeof(Object*)).Int32Value();
 
     oatFlushAllRegs(cUnit);
     /* Make sure it's a legal object Put. Use direct regs at first */
@@ -1313,12 +1317,18 @@
 {
     RegisterClass regClass = oatRegClassBySize(size);
     int lenOffset = Array::LengthOffset().Int32Value();
-    int dataOffset = Array::DataOffset().Int32Value();
+    int dataOffset;
     RegLocation rlResult;
     rlArray = loadValue(cUnit, rlArray, kCoreReg);
     rlIndex = loadValue(cUnit, rlIndex, kCoreReg);
     int regPtr;
 
+    if (size == kLong || size == kDouble) {
+      dataOffset = Array::DataOffset(sizeof(int64_t)).Int32Value();
+    } else {
+      dataOffset = Array::DataOffset(sizeof(int32_t)).Int32Value();
+    }
+
     /* null object? */
     genNullCheck(cUnit, rlArray.sRegLow, rlArray.lowReg, mir);
 
@@ -1375,7 +1385,13 @@
 {
     RegisterClass regClass = oatRegClassBySize(size);
     int lenOffset = Array::LengthOffset().Int32Value();
-    int dataOffset = Array::DataOffset().Int32Value();
+    int dataOffset;
+
+    if (size == kLong || size == kDouble) {
+      dataOffset = Array::DataOffset(sizeof(int64_t)).Int32Value();
+    } else {
+      dataOffset = Array::DataOffset(sizeof(int32_t)).Int32Value();
+    }
 
     int regPtr;
     rlArray = loadValue(cUnit, rlArray, kCoreReg);
diff --git a/src/dalvik_system_VMRuntime.cc b/src/dalvik_system_VMRuntime.cc
index df12658..c8a3a4c 100644
--- a/src/dalvik_system_VMRuntime.cc
+++ b/src/dalvik_system_VMRuntime.cc
@@ -77,6 +77,9 @@
 }
 
 jlong VMRuntime_addressOf(JNIEnv* env, jobject, jobject javaArray) {
+  if (javaArray == NULL) {  // Most likely allocation failed
+    return 0;
+  }
   ScopedThreadStateChange tsc(Thread::Current(), Thread::kRunnable);
   Array* array = Decode<Array*>(env, javaArray);
   if (!array->IsArrayInstance()) {
@@ -84,7 +87,7 @@
     return 0;
   }
   // TODO: we should also check that this is a non-movable array.
-  return reinterpret_cast<uintptr_t>(array->GetRawData());
+  return reinterpret_cast<uintptr_t>(array->GetRawData(array->GetClass()->GetComponentSize()));
 }
 
 void VMRuntime_clearGrowthLimit(JNIEnv*, jobject) {
diff --git a/src/debugger.cc b/src/debugger.cc
index 89cfe64..d2372e3 100644
--- a/src/debugger.cc
+++ b/src/debugger.cc
@@ -786,18 +786,18 @@
 
   if (IsPrimitiveTag(tag)) {
     size_t width = GetTagWidth(tag);
-    const uint8_t* src = reinterpret_cast<uint8_t*>(a->GetRawData());
     uint8_t* dst = expandBufAddSpace(pReply, count * width);
     if (width == 8) {
-      const uint64_t* src8 = reinterpret_cast<const uint64_t*>(src);
+      const uint64_t* src8 = reinterpret_cast<uint64_t*>(a->GetRawData(sizeof(uint64_t)));
       for (int i = 0; i < count; ++i) JDWP::Write8BE(&dst, src8[offset + i]);
     } else if (width == 4) {
-      const uint32_t* src4 = reinterpret_cast<const uint32_t*>(src);
+      const uint32_t* src4 = reinterpret_cast<uint32_t*>(a->GetRawData(sizeof(uint32_t)));
       for (int i = 0; i < count; ++i) JDWP::Write4BE(&dst, src4[offset + i]);
     } else if (width == 2) {
-      const uint16_t* src2 = reinterpret_cast<const uint16_t*>(src);
+      const uint16_t* src2 = reinterpret_cast<uint16_t*>(a->GetRawData(sizeof(uint16_t)));
       for (int i = 0; i < count; ++i) JDWP::Write2BE(&dst, src2[offset + i]);
     } else {
+      const uint8_t* src = reinterpret_cast<uint8_t*>(a->GetRawData(sizeof(uint8_t)));
       memcpy(dst, &src[offset * width], count * width);
     }
   } else {
@@ -829,8 +829,8 @@
 
   if (IsPrimitiveTag(tag)) {
     size_t width = GetTagWidth(tag);
-    uint8_t* dst = &(reinterpret_cast<uint8_t*>(a->GetRawData())[offset * width]);
     if (width == 8) {
+      uint8_t* dst = &(reinterpret_cast<uint8_t*>(a->GetRawData(sizeof(uint64_t)))[offset * width]);
       for (int i = 0; i < count; ++i) {
         // Handle potentially non-aligned memory access one byte at a time for ARM's benefit.
         uint64_t value;
@@ -839,12 +839,15 @@
         JDWP::Write8BE(&dst, value);
       }
     } else if (width == 4) {
+      uint8_t* dst = &(reinterpret_cast<uint8_t*>(a->GetRawData(sizeof(uint32_t)))[offset * width]);
       const uint32_t* src4 = reinterpret_cast<const uint32_t*>(src);
       for (int i = 0; i < count; ++i) JDWP::Write4BE(&dst, src4[i]);
     } else if (width == 2) {
+      uint8_t* dst = &(reinterpret_cast<uint8_t*>(a->GetRawData(sizeof(uint16_t)))[offset * width]);
       const uint16_t* src2 = reinterpret_cast<const uint16_t*>(src);
       for (int i = 0; i < count; ++i) JDWP::Write2BE(&dst, src2[i]);
     } else {
+      uint8_t* dst = reinterpret_cast<uint8_t*>(a->GetRawData(sizeof(uint8_t)));  // Array base; memcpy below adds the offset.
       memcpy(&dst[offset * width], src, count * width);
     }
   } else {
diff --git a/src/dex_cache.h b/src/dex_cache.h
index 2aa2a6a..2560638 100644
--- a/src/dex_cache.h
+++ b/src/dex_cache.h
@@ -47,17 +47,17 @@
   }
 
   static MemberOffset StringsOffset() {
-    return MemberOffset(DataOffset().Int32Value() +
+    return MemberOffset(DataOffset(sizeof(Object*)).Int32Value() +
                         kStrings * sizeof(Object*));
   }
 
   static MemberOffset ResolvedFieldsOffset() {
-    return MemberOffset(DataOffset().Int32Value() +
+    return MemberOffset(DataOffset(sizeof(Object*)).Int32Value() +
                         kResolvedFields * sizeof(Object*));
   }
 
   static MemberOffset ResolvedMethodsOffset() {
-    return MemberOffset(DataOffset().Int32Value() +
+    return MemberOffset(DataOffset(sizeof(Object*)).Int32Value() +
                         kResolvedMethods * sizeof(Object*));
   }
 
diff --git a/src/hprof/hprof.cc b/src/hprof/hprof.cc
index 66cb24d..be37207 100644
--- a/src/hprof/hprof.cc
+++ b/src/hprof/hprof.cc
@@ -421,7 +421,7 @@
         rec->AddId(LookupClassId(clazz));
 
         // Dump the elements, which are always objects or NULL.
-        rec->AddIdList((const HprofObjectId *)aobj->GetRawData(), length);
+        rec->AddIdList((const HprofObjectId *)aobj->GetRawData(sizeof(Object*)), length);
       } else {
         size_t size;
         HprofBasicType t = PrimitiveToBasicTypeAndSize(clazz->GetComponentType()->GetPrimitiveType(), &size);
@@ -441,13 +441,13 @@
 #if DUMP_PRIM_DATA
         // Dump the raw, packed element values.
         if (size == 1) {
-          rec->AddU1List((const uint8_t *)aobj->GetRawData(), length);
+          rec->AddU1List((const uint8_t *)aobj->GetRawData(sizeof(uint8_t)), length);
         } else if (size == 2) {
-          rec->AddU2List((const uint16_t *)(void *)aobj->GetRawData(), length);
+          rec->AddU2List((const uint16_t *)(void *)aobj->GetRawData(sizeof(uint16_t)), length);
         } else if (size == 4) {
-          rec->AddU4List((const uint32_t *)(void *)aobj->GetRawData(), length);
+          rec->AddU4List((const uint32_t *)(void *)aobj->GetRawData(sizeof(uint32_t)), length);
         } else if (size == 8) {
-          rec->AddU8List((const uint64_t *)aobj->GetRawData(), length);
+          rec->AddU8List((const uint64_t *)aobj->GetRawData(sizeof(uint64_t)), length);
         }
 #endif
       }
diff --git a/src/java_lang_System.cc b/src/java_lang_System.cc
index 78100cf..b48cee9 100644
--- a/src/java_lang_System.cc
+++ b/src/java_lang_System.cc
@@ -148,9 +148,6 @@
     return;
   }
 
-  uint8_t* dstBytes = reinterpret_cast<uint8_t*>(dstArray->GetRawData());
-  const uint8_t* srcBytes = reinterpret_cast<const uint8_t*>(srcArray->GetRawData());
-
   // Handle primitive arrays.
   if (srcComponentType->IsPrimitive() || dstComponentType->IsPrimitive()) {
     // If one of the arrays holds a primitive type the other array must hold the exact same type.
@@ -162,7 +159,11 @@
       return;
     }
 
-    switch (srcArray->GetClass()->GetComponentSize()) {
+    size_t width = srcArray->GetClass()->GetComponentSize();
+    uint8_t* dstBytes = reinterpret_cast<uint8_t*>(dstArray->GetRawData(width));
+    const uint8_t* srcBytes = reinterpret_cast<const uint8_t*>(srcArray->GetRawData(width));
+
+    switch (width) {
     case 1:
       memmove(dstBytes + dstPos, srcBytes + srcPos, length);
       break;
@@ -185,6 +186,8 @@
 
   // Neither class is primitive. Are the types trivially compatible?
   const size_t width = sizeof(Object*);
+  uint8_t* dstBytes = reinterpret_cast<uint8_t*>(dstArray->GetRawData(width));
+  const uint8_t* srcBytes = reinterpret_cast<const uint8_t*>(srcArray->GetRawData(width));
   if (dstArray == srcArray || dstComponentType->IsAssignableFrom(srcComponentType)) {
     // Yes. Bulk copy.
     COMPILE_ASSERT(sizeof(width) == sizeof(uint32_t), move32_assumes_Object_references_are_32_bit);
diff --git a/src/jni_internal.cc b/src/jni_internal.cc
index 60e34ba..04832c7 100644
--- a/src/jni_internal.cc
+++ b/src/jni_internal.cc
@@ -2053,9 +2053,14 @@
     return NewPrimitiveArray<jshortArray, ShortArray>(ts, length);
   }
 
-  static void* GetPrimitiveArrayCritical(JNIEnv* env, jarray array, jboolean* is_copy) {
+  static void* GetPrimitiveArrayCritical(JNIEnv* env, jarray java_array, jboolean* is_copy) {
     ScopedJniThreadState ts(env);
-    return GetPrimitiveArray<jarray, jbyte*, ByteArray>(ts, array, is_copy);
+    Array* array = Decode<Array*>(ts, java_array);
+    PinPrimitiveArray(ts, array);
+    if (is_copy != NULL) {
+      *is_copy = JNI_FALSE;
+    }
+    return array->GetRawData(array->GetClass()->GetComponentSize());
   }
 
   static void ReleasePrimitiveArrayCritical(JNIEnv* env, jarray array, void* data, jint mode) {
diff --git a/src/mark_sweep.cc b/src/mark_sweep.cc
index 48473d5..952bf85 100644
--- a/src/mark_sweep.cc
+++ b/src/mark_sweep.cc
@@ -371,8 +371,9 @@
     const ObjectArray<Object>* array = obj->AsObjectArray<Object>();
     for (int32_t i = 0; i < array->GetLength(); ++i) {
       const Object* element = array->GetWithoutChecks(i);
-      CheckReference(obj, element, MemberOffset(i * sizeof(Object*) +
-                                                Array::DataOffset().Int32Value()), false);
+      size_t width = sizeof(Object*);
+      CheckReference(obj, element, MemberOffset(i * width +
+                                                Array::DataOffset(width).Int32Value()), false);
     }
   }
 }
diff --git a/src/object.cc b/src/object.cc
index 7217549..c9e0929 100644
--- a/src/object.cc
+++ b/src/object.cc
@@ -1252,7 +1252,7 @@
   DCHECK_GE(component_count, 0);
   DCHECK(array_class->IsArrayClass());
 
-  size_t header_size = sizeof(Array);
+  size_t header_size = sizeof(Object) + (component_size == sizeof(int64_t) ? 8 : 4);
   size_t data_size = component_count * component_size;
   size_t size = header_size + data_size;
 
diff --git a/src/object.h b/src/object.h
index 29d8255..ca143ac 100644
--- a/src/object.h
+++ b/src/object.h
@@ -973,12 +973,18 @@
     return OFFSET_OF_OBJECT_MEMBER(Array, length_);
   }
 
-  static MemberOffset DataOffset() {
-    return OFFSET_OF_OBJECT_MEMBER(Array, first_element_);
+  static MemberOffset DataOffset(size_t component_size) {
+    if (component_size != sizeof(int64_t)) {
+      return OFFSET_OF_OBJECT_MEMBER(Array, first_element_);
+    } else {
+      // Align longs and doubles.
+      return MemberOffset(OFFSETOF_MEMBER(Array, first_element_) + 4);
+    }
   }
 
-  void* GetRawData() {
-    return reinterpret_cast<void*>(first_element_);
+  void* GetRawData(size_t component_size) {
+    intptr_t data = reinterpret_cast<intptr_t>(this) + DataOffset(component_size).Int32Value();
+    return reinterpret_cast<void*>(data);
   }
 
  protected:
@@ -996,8 +1002,6 @@
  private:
   // The number of array elements.
   int32_t length_;
-  // Padding to ensure the first member defined by a subclass begins on a 8-byte boundary
-  int32_t padding_;
   // Marker for the data (used by generated code)
   uint32_t first_element_[0];
 
@@ -1031,7 +1035,7 @@
 
 template<class T>
 ObjectArray<T>* ObjectArray<T>::Alloc(Class* object_array_class, int32_t length) {
-  Array* array = Array::Alloc(object_array_class, length, sizeof(uint32_t));
+  Array* array = Array::Alloc(object_array_class, length, sizeof(Object*));
   if (UNLIKELY(array == NULL)) {
     return NULL;
   } else {
@@ -1044,7 +1048,7 @@
   if (UNLIKELY(!IsValidIndex(i))) {
     return NULL;
   }
-  MemberOffset data_offset(DataOffset().Int32Value() + i * sizeof(Object*));
+  MemberOffset data_offset(DataOffset(sizeof(Object*)).Int32Value() + i * sizeof(Object*));
   return GetFieldObject<T*>(data_offset, false);
 }
 
@@ -1982,7 +1986,11 @@
 
 inline size_t Array::SizeOf() const {
   // This is safe from overflow because the array was already allocated, so we know it's sane.
-  return sizeof(Array) + GetLength() * GetClass()->GetComponentSize();
+  size_t component_size = GetClass()->GetComponentSize();
+  int32_t component_count = GetLength();
+  size_t header_size = sizeof(Object) + (component_size == sizeof(int64_t) ? 8 : 4);
+  size_t data_size = component_count * component_size;
+  return header_size + data_size;
 }
 
 template<class T>
@@ -1995,7 +2003,7 @@
         return;
       }
     }
-    MemberOffset data_offset(DataOffset().Int32Value() + i * sizeof(Object*));
+    MemberOffset data_offset(DataOffset(sizeof(Object*)).Int32Value() + i * sizeof(Object*));
     SetFieldObject(data_offset, object, false);
   }
 }
@@ -2003,14 +2011,14 @@
 template<class T>
 void ObjectArray<T>::SetWithoutChecks(int32_t i, T* object) {
   DCHECK(IsValidIndex(i));
-  MemberOffset data_offset(DataOffset().Int32Value() + i * sizeof(Object*));
+  MemberOffset data_offset(DataOffset(sizeof(Object*)).Int32Value() + i * sizeof(Object*));
   SetFieldObject(data_offset, object, false);
 }
 
 template<class T>
 T* ObjectArray<T>::GetWithoutChecks(int32_t i) const {
   DCHECK(IsValidIndex(i));
-  MemberOffset data_offset(DataOffset().Int32Value() + i * sizeof(Object*));
+  MemberOffset data_offset(DataOffset(sizeof(Object*)).Int32Value() + i * sizeof(Object*));
   return GetFieldObject<T*>(data_offset, false);
 }
 
@@ -2022,8 +2030,8 @@
       src->IsValidIndex(src_pos+length-1) &&
       dst->IsValidIndex(dst_pos) &&
       dst->IsValidIndex(dst_pos+length-1)) {
-    MemberOffset src_offset(DataOffset().Int32Value() + src_pos * sizeof(Object*));
-    MemberOffset dst_offset(DataOffset().Int32Value() + dst_pos * sizeof(Object*));
+    MemberOffset src_offset(DataOffset(sizeof(Object*)).Int32Value() + src_pos * sizeof(Object*));
+    MemberOffset dst_offset(DataOffset(sizeof(Object*)).Int32Value() + dst_pos * sizeof(Object*));
     Class* array_class = dst->GetClass();
     if (array_class == src->GetClass()) {
       // No need for array store checks if arrays are of the same type
@@ -2095,11 +2103,13 @@
   static PrimitiveArray<T>* Alloc(size_t length);
 
   const T* GetData() const {
-    return &elements_[0];
+    intptr_t data = reinterpret_cast<intptr_t>(this) + DataOffset(sizeof(T)).Int32Value();
+    return reinterpret_cast<T*>(data);
   }
 
   T* GetData() {
-    return &elements_[0];
+    intptr_t data = reinterpret_cast<intptr_t>(this) + DataOffset(sizeof(T)).Int32Value();
+    return reinterpret_cast<T*>(data);
   }
 
   T Get(int32_t i) const {
@@ -2128,9 +2138,6 @@
   }
 
  private:
-  // Location of first element.
-  T elements_[0];
-
   static Class* array_class_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(PrimitiveArray);
diff --git a/src/runtime_support.cc b/src/runtime_support.cc
index 00c58b5..6ca6e14 100644
--- a/src/runtime_support.cc
+++ b/src/runtime_support.cc
@@ -1203,7 +1203,7 @@
   }
   uint16_t width = table[1];
   uint32_t size_in_bytes = size * width;
-  memcpy((char*)array + Array::DataOffset().Int32Value(), (char*)&table[4], size_in_bytes);
+  memcpy((char*)array + Array::DataOffset(width).Int32Value(), (char*)&table[4], size_in_bytes);
   return 0;  // Success
 }
 
diff --git a/src/sun_misc_Unsafe.cc b/src/sun_misc_Unsafe.cc
index c639f97..68cd4f7 100644
--- a/src/sun_misc_Unsafe.cc
+++ b/src/sun_misc_Unsafe.cc
@@ -24,13 +24,17 @@
 namespace {
 
 jlong Unsafe_objectFieldOffset0(JNIEnv* env, jclass, jobject javaField) {
+  // TODO: move to Java code
   jfieldID fid = env->FromReflectedField(javaField);
   Field* field = DecodeField(fid);
   return field->GetOffset().Int32Value();
 }
 
-jint Unsafe_arrayBaseOffset0(JNIEnv*, jclass, jclass) {
-  return Array::DataOffset().Int32Value();
+jint Unsafe_arrayBaseOffset0(JNIEnv* env, jclass, jclass javaArrayClass) {
+  // TODO: move to Java code
+  ScopedThreadStateChange tsc(Thread::Current(), Thread::kRunnable);
+  Class* array_class = Decode<Class*>(env, javaArrayClass);
+  return Array::DataOffset(array_class->GetComponentSize()).Int32Value();
 }
 
 jint Unsafe_arrayIndexScale0(JNIEnv* env, jclass, jclass javaClass) {