Use the fast path object size for RosAlloc asm stubs

Also address review comments. MemAllocTest perf on N5X: the speedup is within the noise.
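
The idea behind the fast path object size, as a minimal sketch with
hypothetical field and helper names (not the exact runtime code): a class
that must not be allocated on the fast path, i.e. one that is not yet
initialized or is finalizable, publishes an impossibly large fast-path
size. The single unsigned size comparison in the stub then sends it to
the slow path, so the separate status and access-flags checks can be
dropped.

    #include <cstdint>
    #include <limits>

    struct ClassSketch {
      uint32_t object_size_;                  // Real instance size.
      uint32_t object_size_alloc_fast_path_;  // Size read by the asm stubs.
      bool initialized_;
      bool finalizable_;

      // Hypothetical helper: a class that cannot take the fast path reports
      // the maximum size, which fails both the RosAlloc bracket-size check
      // and the TLAB fit check in the stubs.
      void UpdateAllocFastPathSize() {
        object_size_alloc_fast_path_ =
            (initialized_ && !finalizable_)
                ? object_size_
                : std::numeric_limits<uint32_t>::max();
      }
    };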

Bug: 9986565

Test: test-art-host -j32, N5X booting

Change-Id: Ic22ca92aab88b37fd66928949bf11264ee3476dc
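
For reference, a sketch of the bracket-index arithmetic the rewritten
RosAlloc stub relies on (the 16-byte quantum below is an assumption for
illustration): because the fast path size is already aligned to the
bracket quantum, size >> shift gives the one-based bracket index, and the
"- 1" is folded into the runs-array base offset
(THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__) instead of a separate
decl, matching the result of the old (size - 1) >> shift form.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t kQuantumShift = 4;              // Assumed 16-byte quantum.
      const uint32_t kQuantum = 1u << kQuantumShift;
      for (uint32_t size = kQuantum; size <= 8 * kQuantum; size += kQuantum) {
        uint32_t old_index = (size - 1) >> kQuantumShift;  // decl; shrl
        uint32_t new_index = (size >> kQuantumShift) - 1;  // shrl; -1 via base offset
        assert(old_index == new_index);
      }
      return 0;
    }

Both forms agree for quantum-aligned sizes; the explicit decrement was
only needed when the raw, unaligned object size was used.
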
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 67df081..ce095b4 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -897,17 +897,6 @@
     movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
     testl %edx, %edx                                    // Check null class
     jz   .Lart_quick_alloc_object_rosalloc_slow_path
-                                                        // Check class status
-    cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%edx)
-    jne  .Lart_quick_alloc_object_rosalloc_slow_path
-                                                        // No fake dependence needed on x86
-                                                        // between status and flags load,
-                                                        // since each load is a load-acquire,
-                                                        // no loads reordering.
-                                                        // Check access flags has
-                                                        // kAccClassIsFinalizable
-    testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%edx)
-    jnz   .Lart_quick_alloc_object_rosalloc_slow_path
 
     movl %fs:THREAD_SELF_OFFSET, %ebx                   // ebx = thread
                                                         // Check if the thread local allocation
@@ -916,21 +905,19 @@
     cmpl THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%ebx), %edi
     jae  .Lart_quick_alloc_object_rosalloc_slow_path
 
-    movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%edx), %edi    // Load the object size (edi)
+    movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%edx), %edi  // Load the object size (edi)
                                                         // Check if the size is for a thread
-                                                        // local allocation
+                                                        // local allocation. Also does the
+                                                        // finalizable and initialization check.
     cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %edi
     ja   .Lart_quick_alloc_object_rosalloc_slow_path
-    decl %edi
     shrl LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %edi // Calculate the rosalloc bracket index
                                                             // from object size.
-                                                            // Align up the size by the rosalloc
-                                                            // bracket quantum size and divide
-                                                            // by the quantum size and subtract
-                                                            // by 1. This code is a shorter but
-                                                            // equivalent version.
                                                         // Load thread local rosalloc run (ebx)
-    movl THREAD_ROSALLOC_RUNS_OFFSET(%ebx, %edi, __SIZEOF_POINTER__), %ebx
+                                                        // Subtract __SIZEOF_POINTER__ to subtract
+                                                        // one from edi as there is no 0 byte run
+                                                        // and the size is already aligned.
+    movl (THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)(%ebx, %edi, __SIZEOF_POINTER__), %ebx
                                                         // Load free_list head (edi),
                                                         // this will be the return value.
     movl (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%ebx), %edi
@@ -990,17 +977,11 @@
 MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel)
     testl %edx, %edx                                    // Check null class
     jz   VAR(slowPathLabel)
-                                                        // No fake dependence needed on x86
-                                                        // between status and flags load,
-                                                        // since each load is a load-acquire,
-                                                        // no loads reordering.
     movl %fs:THREAD_SELF_OFFSET, %ebx                   // ebx = thread
     movl THREAD_LOCAL_END_OFFSET(%ebx), %edi            // Load thread_local_end.
     subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi            // Compute the remaining buffer size.
     movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%edx), %esi  // Load the object size.
-    cmpl %edi, %esi                                     // Check if it fits. OK to do this
-                                                        // before rounding up the object size
-                                                        // assuming the buf size alignment.
+    cmpl %edi, %esi                                     // Check if it fits.
     ja   VAR(slowPathLabel)
     movl THREAD_LOCAL_POS_OFFSET(%ebx), %eax            // Load thread_local_pos
                                                         // as allocated object.