Faster allocation fast path

Added a new object size field to mirror::Class. The field contains the
aligned object size if the class is initialized and its instances are
not finalizable. If the class is finalizable or uninitialized, the
field is set to a large sentinel value that forces the ASM allocators
to take the slow path.
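
In C++ terms, the published value is either the aligned instance size
or a sentinel that can never fit in a TLAB. A minimal sketch of the
encoding (hypothetical standalone type; the real logic lives in
mirror::Class::SetStatus in the diff below):

    #include <cstdint>
    #include <limits>

    constexpr uint32_t kObjectAlignment = 8;

    struct ClassSketch {
      uint32_t object_size;  // Unaligned instance size.
      bool finalizable;
      bool initialized;

      // Value the assembly fast path loads: the stub compares it against
      // the remaining TLAB space, so INT32_MAX always fails the "fits"
      // check and falls through to the slow path.
      uint32_t ObjectSizeAllocFastPath() const {
        if (!initialized || finalizable) {
          return std::numeric_limits<int32_t>::max();
        }
        // RoundUp(size, 8): (size + 7) & ~7.
        return (object_size + kObjectAlignment - 1) & ~(kObjectAlignment - 1);
      }
    };

With this encoding the stub needs only a single load and compare where
it previously needed separate class status and access-flags checks.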

Only implemented for region/normal TLAB for now; will add it to the
RosAlloc stubs soon.

CC N6P MemAllocTest: 1067 -> 1039 (25 samples)
CC N6P EAAC: 1281 -> 1260 (25 samples)

RAM overhead is technically 0 since mirror::Class was not 8 byte
aligned previously. Since the allocators require 8 byte alignment,
there would have been 1 word of padding at the end of the class. If
there were actually 4 extra bytes per class, the system overhead would
be 36000 * 4 bytes = ~144KB, based on old N6P numbers for the number of
loaded classes after boot.
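
For illustration, the padding arithmetic as a sketch (the example size
is hypothetical; the 36000-class count is the measured figure above):

    #include <cstdint>
    #include <cstdio>

    int main() {
      auto aligned = [](uint32_t n) { return (n + 7u) & ~7u; };
      uint32_t raw = 116;  // Example class object size that is 4 mod 8.
      // Before: 4 bytes of tail padding; after adding the 4-byte field,
      // the aligned allocation size is unchanged.
      std::printf("padding before: %u\n", aligned(raw) - raw);            // 4
      std::printf("padding after:  %u\n", aligned(raw + 4) - (raw + 4));  // 0
      // Worst case, if the padding were not already free:
      std::printf("worst case: %u KB\n", 36000u * 4u / 1000u);            // 144
      return 0;
    }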

Bug: 9986565

Test: test-art-host CC baker, N6P phone boot and EAAC runs.

Change-Id: I119a87b8cc6c980bff980a0c62f42610dab5e531
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index e25e93f..bc4c999 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1191,25 +1191,6 @@
 // Need to preserve r0 and r1 to the slow path.
 .macro ALLOC_OBJECT_TLAB_FAST_PATH slowPathLabel
     cbz    r2, \slowPathLabel                                 // Check null class
-                                                              // Check class status.
-    ldr    r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
-    cmp    r3, #MIRROR_CLASS_STATUS_INITIALIZED
-    bne    \slowPathLabel
-                                                              // Add a fake dependence from the
-                                                              // following access flag and size
-                                                              // loads to the status load.
-                                                              // This is to prevent those loads
-                                                              // from being reordered above the
-                                                              // status load and reading wrong
-                                                              // values (an alternative is to use
-                                                              // a load-acquire for the status).
-    eor    r3, r3, r3
-    add    r2, r2, r3
-                                                              // Check access flags has
-                                                              // kAccClassIsFinalizable.
-    ldr    r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
-    tst    r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
-    bne    \slowPathLabel
                                                               // Load thread_local_pos (r12) and
                                                               // thread_local_end (r3) with ldrd.
                                                               // Check constraints for ldrd.
@@ -1218,16 +1199,12 @@
 #endif
     ldrd   r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
     sub    r12, r3, r12                                       // Compute the remaining buf size.
-    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (r3).
+    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (r3).
     cmp    r3, r12                                            // Check if it fits. OK to do this
                                                               // before rounding up the object size
                                                               // assuming the buf size alignment.
     bhi    \slowPathLabel
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
-                                                              // Round up the object size by the
-                                                              // object alignment. (addr + 7) & ~7.
-    add    r3, r3, #OBJECT_ALIGNMENT_MASK
-    and    r3, r3, #OBJECT_ALIGNMENT_MASK_TOGGLED
                                                               // Reload old thread_local_pos (r0)
                                                               // for the return value.
     ldr    r0, [r9, #THREAD_LOCAL_POS_OFFSET]
@@ -1244,7 +1221,7 @@
                                                               // the fields of the class.
                                                               // Alternatively we could use "ishst"
                                                               // if we use load-acquire for the
-                                                              // class status load.)
+                                                              // object size load.)
     dmb    ish
     bx     lr
 .endm
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 3f87a14..76e503c 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2027,48 +2027,24 @@
     ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED \slowPathLabel
 .endm
 
+// TODO: delete ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED since it is the same as
+// ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED.
 .macro ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED slowPathLabel
-    ldr    w3, [x2, #MIRROR_CLASS_STATUS_OFFSET]              // Check class status.
-    cmp    x3, #MIRROR_CLASS_STATUS_INITIALIZED
-    bne    \slowPathLabel
-                                                              // Add a fake dependence from the
-                                                              // following access flag and size
-                                                              // loads to the status load.
-                                                              // This is to prevent those loads
-                                                              // from being reordered above the
-                                                              // status load and reading wrong
-                                                              // values (an alternative is to use
-                                                              // a load-acquire for the status).
-    eor    x3, x3, x3
-    add    x2, x2, x3
     ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED \slowPathLabel
 .endm
 
 .macro ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED slowPathLabel
-                                                              // Check access flags has
-                                                              // kAccClassIsFinalizable.
-    ldr    w3, [x2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
-    tbnz   x3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE_BIT, \slowPathLabel
-                                                              // Load thread_local_pos (x4) and
-                                                              // thread_local_end (x5).
     ldr    x4, [xSELF, #THREAD_LOCAL_POS_OFFSET]
     ldr    x5, [xSELF, #THREAD_LOCAL_END_OFFSET]
-    sub    x6, x5, x4                                         // Compute the remaining buf size.
-    ldr    w7, [x2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (x7).
-    cmp    x7, x6                                             // Check if it fits. OK to do this
-                                                              // before rounding up the object size
-                                                              // assuming the buf size alignment.
+    ldr    w7, [x2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (x7).
+    add    x6, x4, x7                                         // Add object size to tlab pos.
+    cmp    x6, x5                                             // Check if it fits, overflow works
+                                                              // since the tlab pos and end are 32
+                                                              // bit values.
     bhi    \slowPathLabel
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1.
-                                                              // Round up the object size by the
-                                                              // object alignment. (addr + 7) & ~7.
-    add    x7, x7, #OBJECT_ALIGNMENT_MASK
-    and    x7, x7, #OBJECT_ALIGNMENT_MASK_TOGGLED
-                                                              // Move old thread_local_pos to x0
-                                                              // for the return value.
     mov    x0, x4
-    add    x5, x0, x7
-    str    x5, [xSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
+    str    x6, [xSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
     ldr    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]          // Increment thread_local_objects.
     add    x5, x5, #1
     str    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
@@ -2080,7 +2056,7 @@
                                                               // the fields of the class.
                                                               // Alternatively we could use "ishst"
                                                               // if we use load-acquire for the
-                                                              // class status load.)
+                                                              // object size load.)
     dmb    ish
     ret
 .endm
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 282f10d..67df081 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -990,32 +990,23 @@
 MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel)
     testl %edx, %edx                                    // Check null class
     jz   VAR(slowPathLabel)
-                                                        // Check class status.
-    cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%edx)
-    jne  VAR(slowPathLabel)
                                                         // No fake dependence needed on x86
                                                         // between status and flags load,
                                                         // since each load is a load-acquire,
                                                         // no loads reordering.
-                                                        // Check access flags has
-                                                        // kAccClassIsFinalizable
-    testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%edx)
-    jnz  VAR(slowPathLabel)
     movl %fs:THREAD_SELF_OFFSET, %ebx                   // ebx = thread
     movl THREAD_LOCAL_END_OFFSET(%ebx), %edi            // Load thread_local_end.
     subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi            // Compute the remaining buffer size.
-    movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%edx), %esi    // Load the object size.
+    movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%edx), %esi  // Load the object size.
     cmpl %edi, %esi                                     // Check if it fits. OK to do this
                                                         // before rounding up the object size
                                                         // assuming the buf size alignment.
     ja   VAR(slowPathLabel)
-    addl LITERAL(OBJECT_ALIGNMENT_MASK), %esi           // Align the size by 8. (addr + 7) & ~7.
-    andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %esi
     movl THREAD_LOCAL_POS_OFFSET(%ebx), %eax            // Load thread_local_pos
                                                         // as allocated object.
     addl %eax, %esi                                     // Add the object size.
     movl %esi, THREAD_LOCAL_POS_OFFSET(%ebx)            // Update thread_local_pos.
-    addl LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%ebx)  // Increase thread_local_objects.
+    incl THREAD_LOCAL_OBJECTS_OFFSET(%ebx)              // Increase thread_local_objects.
                                                         // Store the class pointer in the header.
                                                         // No fence needed for x86.
     POISON_HEAP_REF edx
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 06ff7ab..c568715 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1020,21 +1020,12 @@
 END_MACRO
 
 // The common fast path code for art_quick_alloc_object_resolved_region_tlab.
+// TODO: delete ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH since it is the same as
+// ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH.
 //
 // RDI: type_idx, RSI: ArtMethod*, RDX/EDX: the class, RAX: return value.
 // RCX: scratch, r8: Thread::Current().
 MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH, slowPathLabel)
-                                                           // Check class status.
-    cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx)
-    jne  RAW_VAR(slowPathLabel)
-                                                           // No fake dependence needed on x86
-                                                           // between status and flags load,
-                                                           // since each load is a load-acquire,
-                                                           // no loads reordering.
-                                                           // Check access flags has
-                                                           // kAccClassIsFinalizable
-    testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx)
-    jnz  RAW_VAR(slowPathLabel)
     ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH(RAW_VAR(slowPathLabel))
 END_MACRO
 
@@ -1044,19 +1035,16 @@
 // RCX: scratch, r8: Thread::Current().
 MACRO1(ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH, slowPathLabel)
     movq %gs:THREAD_SELF_OFFSET, %r8                           // r8 = thread
-    movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %ecx           // Load the object size.
+    movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%rdx), %ecx // Load the object size.
     movq THREAD_LOCAL_POS_OFFSET(%r8), %rax
-    leaq OBJECT_ALIGNMENT_MASK(%rax, %rcx), %rcx               // Add size to pos, note that these
+    addq %rax, %rcx                                            // Add size to pos, note that these
                                                                // are both 32 bit ints, overflow
                                                                // will cause the add to be past the
                                                                // end of the thread local region.
-                                                               // Also sneak in alignment mask add.
-    andq LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED64), %rcx        // Align the size by 8. (addr + 7) &
-                                                               // ~7.
     cmpq THREAD_LOCAL_END_OFFSET(%r8), %rcx                    // Check if it fits.
     ja   RAW_VAR(slowPathLabel)
     movq %rcx, THREAD_LOCAL_POS_OFFSET(%r8)                    // Update thread_local_pos.
-    addq LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%r8)          // Increase thread_local_objects.
+    incq THREAD_LOCAL_OBJECTS_OFFSET(%r8)                      // Increase thread_local_objects.
                                                                // Store the class pointer in the
                                                                // header.
                                                                // No fence needed for x86.
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 102b993..f4addf7 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -178,10 +178,13 @@
 #define MIRROR_CLASS_OBJECT_SIZE_OFFSET (96 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_CLASS_OBJECT_SIZE_OFFSET,
             art::mirror::Class::ObjectSizeOffset().Int32Value())
-#define MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET (100 + MIRROR_OBJECT_HEADER_SIZE)
+#define MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET (100 + MIRROR_OBJECT_HEADER_SIZE)
+ADD_TEST_EQ(MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET,
+            art::mirror::Class::ObjectSizeAllocFastPathOffset().Int32Value())
+#define MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET (104 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET,
             art::mirror::Class::PrimitiveTypeOffset().Int32Value())
-#define MIRROR_CLASS_STATUS_OFFSET (108 + MIRROR_OBJECT_HEADER_SIZE)
+#define MIRROR_CLASS_STATUS_OFFSET (112 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_CLASS_STATUS_OFFSET,
             art::mirror::Class::StatusOffset().Int32Value())
 
diff --git a/runtime/class_linker_test.cc b/runtime/class_linker_test.cc
index e0ff3dd..3be39a1 100644
--- a/runtime/class_linker_test.cc
+++ b/runtime/class_linker_test.cc
@@ -586,6 +586,8 @@
     addOffset(OFFSETOF_MEMBER(mirror::Class, num_reference_static_fields_),
               "numReferenceStaticFields");
     addOffset(OFFSETOF_MEMBER(mirror::Class, object_size_), "objectSize");
+    addOffset(OFFSETOF_MEMBER(mirror::Class, object_size_alloc_fast_path_),
+              "objectSizeAllocFastPath");
     addOffset(OFFSETOF_MEMBER(mirror::Class, primitive_type_), "primitiveType");
     addOffset(OFFSETOF_MEMBER(mirror::Class, reference_instance_offsets_),
               "referenceInstanceOffsets");
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 0f2aac2..26b8e8a 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -861,6 +861,8 @@
   klass->SetPrimitiveType(Primitive::kPrimNot);  // Default to not being primitive.
   klass->SetDexClassDefIndex(DexFile::kDexNoIndex16);  // Default to no valid class def index.
   klass->SetDexTypeIndex(DexFile::kDexNoIndex16);  // Default to no valid type index.
+  // Default to force slow path until initialized.
+  klass->SetObjectSizeAllocFastPath(std::numeric_limits<int32_t>::max());
 }
 
 inline void Class::SetAccessFlags(uint32_t new_access_flags) {
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 96b3345..b60c573 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -100,9 +100,21 @@
   }
   static_assert(sizeof(Status) == sizeof(uint32_t), "Size of status not equal to uint32");
   if (Runtime::Current()->IsActiveTransaction()) {
-    h_this->SetField32Volatile<true>(OFFSET_OF_OBJECT_MEMBER(Class, status_), new_status);
+    h_this->SetField32Volatile<true>(StatusOffset(), new_status);
   } else {
-    h_this->SetField32Volatile<false>(OFFSET_OF_OBJECT_MEMBER(Class, status_), new_status);
+    h_this->SetField32Volatile<false>(StatusOffset(), new_status);
+  }
+
+  // Setting the object size alloc fast path needs to be after the status write so that if the
+  // alloc path sees a valid object size, we would know that it's initialized as long as it has a
+  // load-acquire/fake dependency.
+  if (new_status == kStatusInitialized && !h_this->IsVariableSize()) {
+    uint32_t object_size = RoundUp(h_this->GetObjectSize(), kObjectAlignment);
+    if (h_this->IsFinalizable()) {
+      // Finalizable objects must always go slow path.
+      object_size = std::numeric_limits<int32_t>::max();
+    }
+    h_this->SetObjectSizeAllocFastPath(object_size);
   }
 
   if (!class_linker_initialized) {
@@ -1209,5 +1221,13 @@
   return flags;
 }
 
+void Class::SetObjectSizeAllocFastPath(uint32_t new_object_size) {
+  if (Runtime::Current()->IsActiveTransaction()) {
+    SetField32Volatile<true>(ObjectSizeAllocFastPathOffset(), new_object_size);
+  } else {
+    SetField32Volatile<false>(ObjectSizeAllocFastPathOffset(), new_object_size);
+  }
+}
+
 }  // namespace mirror
 }  // namespace art
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 1751f32..f8f414b 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -586,6 +586,9 @@
   static MemberOffset ObjectSizeOffset() {
     return OFFSET_OF_OBJECT_MEMBER(Class, object_size_);
   }
+  static MemberOffset ObjectSizeAllocFastPathOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(Class, object_size_alloc_fast_path_);
+  }
 
   void SetObjectSize(uint32_t new_object_size) REQUIRES_SHARED(Locks::mutator_lock_) {
     DCHECK(!IsVariableSize());
@@ -593,6 +596,8 @@
     return SetField32<false>(OFFSET_OF_OBJECT_MEMBER(Class, object_size_), new_object_size);
   }
 
+  void SetObjectSizeAllocFastPath(uint32_t new_object_size) REQUIRES_SHARED(Locks::mutator_lock_);
+
   void SetObjectSizeWithoutChecks(uint32_t new_object_size)
       REQUIRES_SHARED(Locks::mutator_lock_) {
     // Not called within a transaction.
@@ -1457,6 +1462,10 @@
   // See also class_size_.
   uint32_t object_size_;
 
+  // Aligned object size for allocation fast path. The value is max int if the object is
+  // uninitialized or finalizable. Not currently used for variable sized objects.
+  uint32_t object_size_alloc_fast_path_;
+
   // The lower 16 bits contains a Primitive::Type value. The upper 16
   // bits contains the size shift of the primitive type.
   uint32_t primitive_type_;