Merge "Use the fast path object size for RosAlloc asm stubs"
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index bc4c999..c51c336 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1086,25 +1086,6 @@
                                                               // Load the class (r2)
     ldr    r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
     cbz    r2, .Lart_quick_alloc_object_rosalloc_slow_path    // Check null class
-                                                              // Check class status.
-    ldr    r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
-    cmp    r3, #MIRROR_CLASS_STATUS_INITIALIZED
-    bne    .Lart_quick_alloc_object_rosalloc_slow_path
-                                                              // Add a fake dependence from the
-                                                              // following access flag and size
-                                                              // loads to the status load.
-                                                              // This is to prevent those loads
-                                                              // from being reordered above the
-                                                              // status load and reading wrong
-                                                              // values (an alternative is to use
-                                                              // a load-acquire for the status).
-    eor    r3, r3, r3
-    add    r2, r2, r3
-                                                              // Check access flags has
-                                                              // kAccClassIsFinalizable
-    ldr    r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
-    tst    r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
-    bne    .Lart_quick_alloc_object_rosalloc_slow_path
 
     ldr    r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]     // Check if the thread local
                                                               // allocation stack has room.
@@ -1113,22 +1094,21 @@
     cmp    r3, r12
     bhs    .Lart_quick_alloc_object_rosalloc_slow_path
 
-    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (r3)
+    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (r3)
     cmp    r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
-                                                              // local allocation
+                                                              // local allocation. Also does the
+                                                              // initialized and finalizable checks.
     bhs    .Lart_quick_alloc_object_rosalloc_slow_path
                                                               // Compute the rosalloc bracket index
-                                                              // from the size.
-                                                              // Align up the size by the rosalloc
-                                                              // bracket quantum size and divide
-                                                              // by the quantum size and subtract
-                                                              // by 1. This code is a shorter but
-                                                              // equivalent version.
-    sub    r3, r3, #1
-    lsr    r3, r3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
+                                                              // from the size. Since the size is
+                                                              // already aligned we can combine the
+                                                              // two shifts together.
+    add    r12, r9, r3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
+                                                              // Subtract pointer size since there
+                                                              // are no runs for 0 byte allocations
+                                                              // and the size is already aligned.
                                                               // Load the rosalloc run (r12)
-    add    r12, r9, r3, lsl #POINTER_SIZE_SHIFT
-    ldr    r12, [r12, #THREAD_ROSALLOC_RUNS_OFFSET]
+    ldr    r12, [r12, #(THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)]
                                                               // Load the free list head (r3). This
                                                               // will be the return val.
     ldr    r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
@@ -1153,7 +1133,7 @@
                                                               // to later accesses to the class
                                                               // object. Alternatively we could use
                                                               // "ishst" if we use load-acquire for
-                                                              // the class status load.)
+                                                              // the object size load.
                                                               // Needs to be done before pushing on
                                                               // allocation since Heap::VisitObjects
                                                               // relies on seeing the class pointer.
@@ -1200,9 +1180,7 @@
     ldrd   r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
     sub    r12, r3, r12                                       // Compute the remaining buf size.
     ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (r3).
-    cmp    r3, r12                                            // Check if it fits. OK to do this
-                                                              // before rounding up the object size
-                                                              // assuming the buf size alignment.
+    cmp    r3, r12                                            // Check if it fits.
     bhi    \slowPathLabel
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
                                                               // Reload old thread_local_pos (r0)
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 82dac9c..03768af 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1860,47 +1860,27 @@
                                                               // Load the class (x2)
     ldr    w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
     cbz    x2, .Lart_quick_alloc_object_rosalloc_slow_path    // Check null class
-                                                              // Check class status.
-    ldr    w3, [x2, #MIRROR_CLASS_STATUS_OFFSET]
-    cmp    x3, #MIRROR_CLASS_STATUS_INITIALIZED
-    bne    .Lart_quick_alloc_object_rosalloc_slow_path
-                                                              // Add a fake dependence from the
-                                                              // following access flag and size
-                                                              // loads to the status load.
-                                                              // This is to prevent those loads
-                                                              // from being reordered above the
-                                                              // status load and reading wrong
-                                                              // values (an alternative is to use
-                                                              // a load-acquire for the status).
-    eor    x3, x3, x3
-    add    x2, x2, x3
-                                                              // Check access flags has
-                                                              // kAccClassIsFinalizable
-    ldr    w3, [x2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
-    tst    x3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
-    bne    .Lart_quick_alloc_object_rosalloc_slow_path
     ldr    x3, [xSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]  // Check if the thread local
                                                               // allocation stack has room.
                                                               // ldp won't work due to large offset.
     ldr    x4, [xSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
     cmp    x3, x4
     bhs    .Lart_quick_alloc_object_rosalloc_slow_path
-    ldr    w3, [x2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (x3)
+    ldr    w3, [x2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (x3)
     cmp    x3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
-                                                              // local allocation
+                                                              // local allocation. Also does the
+                                                              // finalizable and initialization
+                                                              // checks.
     bhs    .Lart_quick_alloc_object_rosalloc_slow_path
                                                               // Compute the rosalloc bracket index
-                                                              // from the size.
-                                                              // Align up the size by the rosalloc
-                                                              // bracket quantum size and divide
-                                                              // by the quantum size and subtract
-                                                              // by 1. This code is a shorter but
-                                                              // equivalent version.
-    sub    x3, x3, #1
-    lsr    x3, x3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
-                                                              // Load the rosalloc run (x4)
-    add    x4, xSELF, x3, lsl #POINTER_SIZE_SHIFT
-    ldr    x4, [x4, #THREAD_ROSALLOC_RUNS_OFFSET]
+                                                              // from the size. Since the size is
+                                                              // already aligned we can combine the
+                                                              // two shifts together.
+    add    x4, xSELF, x3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
+                                                              // Subtract pointer size since there
+                                                              // are no runs for 0 byte allocations
+                                                              // and the size is already aligned.
+    ldr    x4, [x4, #(THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)]
                                                               // Load the free list head (x3). This
                                                               // will be the return val.
     ldr    x3, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
@@ -1921,11 +1901,11 @@
     str    w2, [x3, #MIRROR_OBJECT_CLASS_OFFSET]
                                                               // Fence. This is "ish" not "ishst" so
                                                               // that it also ensures ordering of
-                                                              // the class status load with respect
+                                                              // the object size load with respect
                                                               // to later accesses to the class
                                                               // object. Alternatively we could use
                                                               // "ishst" if we use load-acquire for
-                                                              // the class status load.)
+                                                              // the object size load.
                                                               // Needs to be done before pushing on
                                                               // allocation since Heap::VisitObjects
                                                               // relies on seeing the class pointer.
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 9c22245..67ebf50 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -897,17 +897,6 @@
     movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
     testl %edx, %edx                                    // Check null class
     jz   .Lart_quick_alloc_object_rosalloc_slow_path
-                                                        // Check class status
-    cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%edx)
-    jne  .Lart_quick_alloc_object_rosalloc_slow_path
-                                                        // No fake dependence needed on x86
-                                                        // between status and flags load,
-                                                        // since each load is a load-acquire,
-                                                        // no loads reordering.
-                                                        // Check access flags has
-                                                        // kAccClassIsFinalizable
-    testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%edx)
-    jnz   .Lart_quick_alloc_object_rosalloc_slow_path
 
     movl %fs:THREAD_SELF_OFFSET, %ebx                   // ebx = thread
                                                         // Check if the thread local allocation
@@ -916,21 +905,19 @@
     cmpl THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%ebx), %edi
     jae  .Lart_quick_alloc_object_rosalloc_slow_path
 
-    movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%edx), %edi    // Load the object size (edi)
+    movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%edx), %edi  // Load the object size (edi)
                                                         // Check if the size is for a thread
-                                                        // local allocation
+                                                        // local allocation. Also does the
+                                                        // finalizable and initialization check.
     cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %edi
     ja   .Lart_quick_alloc_object_rosalloc_slow_path
-    decl %edi
     shrl LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %edi // Calculate the rosalloc bracket index
                                                             // from object size.
-                                                            // Align up the size by the rosalloc
-                                                            // bracket quantum size and divide
-                                                            // by the quantum size and subtract
-                                                            // by 1. This code is a shorter but
-                                                            // equivalent version.
                                                         // Load thread local rosalloc run (ebx)
-    movl THREAD_ROSALLOC_RUNS_OFFSET(%ebx, %edi, __SIZEOF_POINTER__), %ebx
+                                                        // Subtract __SIZEOF_POINTER__ to subtract
+                                                        // one from edi as there is no 0 byte run
+                                                        // and the size is already aligned.
+    movl (THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)(%ebx, %edi, __SIZEOF_POINTER__), %ebx
                                                         // Load free_list head (edi),
                                                         // this will be the return value.
     movl (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%ebx), %edi
@@ -990,17 +977,11 @@
 MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel)
     testl %edx, %edx                                    // Check null class
     jz   VAR(slowPathLabel)
-                                                        // No fake dependence needed on x86
-                                                        // between status and flags load,
-                                                        // since each load is a load-acquire,
-                                                        // no loads reordering.
     movl %fs:THREAD_SELF_OFFSET, %ebx                   // ebx = thread
     movl THREAD_LOCAL_END_OFFSET(%ebx), %edi            // Load thread_local_end.
     subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi            // Compute the remaining buffer size.
     movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%edx), %esi  // Load the object size.
-    cmpl %edi, %esi                                     // Check if it fits. OK to do this
-                                                        // before rounding up the object size
-                                                        // assuming the buf size alignment.
+    cmpl %edi, %esi                                     // Check if it fits.
     ja   VAR(slowPathLabel)
     movl THREAD_LOCAL_POS_OFFSET(%ebx), %eax            // Load thread_local_pos
                                                         // as allocated object.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index c568715..b805703 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -935,17 +935,6 @@
     movl   0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
     testl  %edx, %edx                                      // Check null class
     jz     .Lart_quick_alloc_object_rosalloc_slow_path
-                                                           // Check class status.
-    cmpl   LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx)
-    jne    .Lart_quick_alloc_object_rosalloc_slow_path
-                                                           // We don't need a fence (between the
-                                                           // the status and the access flag
-                                                           // loads) here because every load is
-                                                           // a load acquire on x86.
-                                                           // Check access flags has
-                                                           // kAccClassIsFinalizable
-    testl  LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx)
-    jnz    .Lart_quick_alloc_object_rosalloc_slow_path
                                                            // Check if the thread local
                                                            // allocation stack has room.
     movq   %gs:THREAD_SELF_OFFSET, %r8                     // r8 = thread
@@ -953,22 +942,21 @@
     cmpq   THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%r8), %rcx
     jae    .Lart_quick_alloc_object_rosalloc_slow_path
                                                            // Load the object size
-    movl   MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %eax
+    movl   MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%rdx), %eax
                                                            // Check if the size is for a thread
-                                                           // local allocation
+                                                           // local allocation. Also does the
+                                                           // initialized and finalizable checks.
     cmpl   LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %eax
     ja     .Lart_quick_alloc_object_rosalloc_slow_path
                                                            // Compute the rosalloc bracket index
                                                            // from the size.
-                                                           // Align up the size by the rosalloc
-                                                           // bracket quantum size and divide
-                                                           // by the quantum size and subtract
-                                                           // by 1. This code is a shorter but
-                                                           // equivalent version.
-    subq   LITERAL(1), %rax
     shrq   LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %rax
                                                            // Load the rosalloc run (r9)
-    movq   THREAD_ROSALLOC_RUNS_OFFSET(%r8, %rax, __SIZEOF_POINTER__), %r9
+                                                           // Subtract __SIZEOF_POINTER__ to
+                                                           // subtract one from rax as there is no
+                                                           // 0 byte run and the size is already
+                                                           // aligned.
+    movq   (THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)(%r8, %rax, __SIZEOF_POINTER__), %r9
                                                            // Load the free list head (rax). This
                                                            // will be the return val.
     movq   (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%r9), %rax
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 26b8e8a..d1d8caa 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -42,11 +42,19 @@
 template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline uint32_t Class::GetObjectSize() {
   // Note: Extra parentheses to avoid the comma being interpreted as macro parameter separator.
-  DCHECK((!IsVariableSize<kVerifyFlags, kReadBarrierOption>())) << " class=" << PrettyTypeOf(this);
+  DCHECK((!IsVariableSize<kVerifyFlags, kReadBarrierOption>())) << "class=" << PrettyTypeOf(this);
   return GetField32(ObjectSizeOffset());
 }
 
 template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
+inline uint32_t Class::GetObjectSizeAllocFastPath() {
+  // Note: Extra parentheses to avoid the comma being interpreted as macro parameter separator.
+  DCHECK((!IsVariableSize<kVerifyFlags, kReadBarrierOption>())) << "class=" << PrettyTypeOf(this);
+  return GetField32(ObjectSizeAllocFastPathOffset());
+}
+
+
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline Class* Class::GetSuperClass() {
   // Can only get super class for loaded classes (hack for when runtime is
   // initializing)
@@ -862,7 +870,7 @@
   klass->SetDexClassDefIndex(DexFile::kDexNoIndex16);  // Default to no valid class def index.
   klass->SetDexTypeIndex(DexFile::kDexNoIndex16);  // Default to no valid type index.
   // Default to force slow path until initialized.
-  klass->SetObjectSizeAllocFastPath(std::numeric_limits<int32_t>::max());
+  klass->SetObjectSizeAllocFastPath(std::numeric_limits<uint32_t>::max());
 }
 
 inline void Class::SetAccessFlags(uint32_t new_access_flags) {
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index b60c573..c979c28 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -109,12 +109,11 @@
   // alloc path sees a valid object size, we would know that it's initialized as long as it has a
   // load-acquire/fake dependency.
   if (new_status == kStatusInitialized && !h_this->IsVariableSize()) {
-    uint32_t object_size = RoundUp(h_this->GetObjectSize(), kObjectAlignment);
-    if (h_this->IsFinalizable()) {
-      // Finalizable objects must always go slow path.
-      object_size = std::numeric_limits<int32_t>::max();
+    DCHECK_EQ(h_this->GetObjectSizeAllocFastPath(), std::numeric_limits<uint32_t>::max());
+    // Finalizable objects must always go slow path.
+    if (!h_this->IsFinalizable()) {
+      h_this->SetObjectSizeAllocFastPath(RoundUp(h_this->GetObjectSize(), kObjectAlignment));
     }
-    h_this->SetObjectSizeAllocFastPath(object_size);
   }
 
   if (!class_linker_initialized) {
@@ -149,7 +148,7 @@
   if (kIsDebugBuild && new_class_size < GetClassSize()) {
     DumpClass(LOG(INTERNAL_FATAL), kDumpClassFullDetail);
     LOG(INTERNAL_FATAL) << new_class_size << " vs " << GetClassSize();
-    LOG(FATAL) << " class=" << PrettyTypeOf(this);
+    LOG(FATAL) << "class=" << PrettyTypeOf(this);
   }
   // Not called within a transaction.
   SetField32<false>(OFFSET_OF_OBJECT_MEMBER(Class, class_size_), new_class_size);
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index f8f414b..99b7769 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -598,6 +598,10 @@
 
   void SetObjectSizeAllocFastPath(uint32_t new_object_size) REQUIRES_SHARED(Locks::mutator_lock_);
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
+  uint32_t GetObjectSizeAllocFastPath() REQUIRES_SHARED(Locks::mutator_lock_);
+
   void SetObjectSizeWithoutChecks(uint32_t new_object_size)
       REQUIRES_SHARED(Locks::mutator_lock_) {
     // Not called within a transaction.
@@ -1462,7 +1466,7 @@
   // See also class_size_.
   uint32_t object_size_;
 
-  // Aligned object size for allocation fast path. The value is max int if the object is
+  // Aligned object size for allocation fast path. The value is max uint32_t if the object is
   // uninitialized or finalizable. Not currently used for variable sized objects.
   uint32_t object_size_alloc_fast_path_;