Use the fast path object size for RosAlloc asm stubs

Also address comments. MemAllocTest perf on N5X speedup in the noise.

Bug: 9986565

Test: test-art-host -j32, N5X booting

Change-Id: Ic22ca92aab88b37fd66928949bf11264ee3476dc
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 82dac9c..03768af 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1860,47 +1860,27 @@
                                                               // Load the class (x2)
     ldr    w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
     cbz    x2, .Lart_quick_alloc_object_rosalloc_slow_path    // Check null class
-                                                              // Check class status.
-    ldr    w3, [x2, #MIRROR_CLASS_STATUS_OFFSET]
-    cmp    x3, #MIRROR_CLASS_STATUS_INITIALIZED
-    bne    .Lart_quick_alloc_object_rosalloc_slow_path
-                                                              // Add a fake dependence from the
-                                                              // following access flag and size
-                                                              // loads to the status load.
-                                                              // This is to prevent those loads
-                                                              // from being reordered above the
-                                                              // status load and reading wrong
-                                                              // values (an alternative is to use
-                                                              // a load-acquire for the status).
-    eor    x3, x3, x3
-    add    x2, x2, x3
-                                                              // Check access flags has
-                                                              // kAccClassIsFinalizable
-    ldr    w3, [x2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
-    tst    x3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
-    bne    .Lart_quick_alloc_object_rosalloc_slow_path
     ldr    x3, [xSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]  // Check if the thread local
                                                               // allocation stack has room.
                                                               // ldp won't work due to large offset.
     ldr    x4, [xSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
     cmp    x3, x4
     bhs    .Lart_quick_alloc_object_rosalloc_slow_path
-    ldr    w3, [x2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (x3)
+    ldr    w3, [x2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (x3)
     cmp    x3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
-                                                              // local allocation
+                                                              // local allocation. Also does the
+                                                              // finalizable and initialization
+                                                              // checks.
     bhs    .Lart_quick_alloc_object_rosalloc_slow_path
                                                               // Compute the rosalloc bracket index
-                                                              // from the size.
-                                                              // Align up the size by the rosalloc
-                                                              // bracket quantum size and divide
-                                                              // by the quantum size and subtract
-                                                              // by 1. This code is a shorter but
-                                                              // equivalent version.
-    sub    x3, x3, #1
-    lsr    x3, x3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
-                                                              // Load the rosalloc run (x4)
-    add    x4, xSELF, x3, lsl #POINTER_SIZE_SHIFT
-    ldr    x4, [x4, #THREAD_ROSALLOC_RUNS_OFFSET]
+                                                              // from the size. Since the size is
+                                                              // already aligned we can combine the
+                                                              // two shifts together.
+    add    x4, xSELF, x3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
+                                                              // Subtract pointer size since ther
+                                                              // are no runs for 0 byte allocations
+                                                              // and the size is already aligned.
+    ldr    x4, [x4, #(THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)]
                                                               // Load the free list head (x3). This
                                                               // will be the return val.
     ldr    x3, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
@@ -1921,11 +1901,11 @@
     str    w2, [x3, #MIRROR_OBJECT_CLASS_OFFSET]
                                                               // Fence. This is "ish" not "ishst" so
                                                               // that it also ensures ordering of
-                                                              // the class status load with respect
+                                                              // the object size load with respect
                                                               // to later accesses to the class
                                                               // object. Alternatively we could use
                                                               // "ishst" if we use load-acquire for
-                                                              // the class status load.)
+                                                              // the class status load.
                                                               // Needs to be done before pushing on
                                                               // allocation since Heap::VisitObjects
                                                               // relies on seeing the class pointer.