Fix unnecessary read barrier for resolved/initialized allocations

The caller is responsible for the read barrier for these entrypoints.

No measured change in benchmarks; tested with MemAllocTest.

Bug: 12687968

Test: test-art-host CC baker, N6P booting with CC
Change-Id: If6b00fa2c31ff51b943dbbe6caea2ef0a3fa1ae0
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 202846a..3f87a14 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2107,7 +2107,7 @@
 END art_quick_alloc_object_tlab
 
 // The common code for art_quick_alloc_object_*region_tlab
-.macro GENERATE_ALLOC_OBJECT_REGION_TLAB name, entrypoint, fast_path, is_resolved
+.macro GENERATE_ALLOC_OBJECT_REGION_TLAB name, entrypoint, fast_path, is_resolved, read_barrier
 ENTRY \name
     // Fast path region tlab allocation.
     // x0: type_idx/resolved class/return value, x1: ArtMethod*, xSELF(x19): Thread::Current
@@ -2123,16 +2123,19 @@
     ldr    x2, [x1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_64]    // Load dex cache resolved types array
                                                               // Load the class (x2)
     ldr    w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+    // If the class is null, go slow path. The check is required to read the lock word.
+    cbz    w2, .Lslow_path\name
 .endif
+.if \read_barrier
     // Most common case: GC is not marking.
     ldr    w3, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
     cbnz   x3, .Lmarking\name
+.endif
 .Ldo_allocation\name:
     \fast_path .Lslow_path\name
 .Lmarking\name:
+.if \read_barrier
     // GC is marking, check the lock word of the class for the mark bit.
-    // If the class is null, go slow path. The check is required to read the lock word.
-    cbz    w2, .Lslow_path\name
     // Class is not null, check mark bit in lock word.
     ldr    w3, [x2, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
     // If the bit is not zero, do the allocation.
@@ -2140,14 +2143,23 @@
                                                               // The read barrier slow path. Mark
                                                               // the class.
     stp    x0, x1, [sp, #-32]!                                // Save registers (x0, x1, lr).
+    .cfi_adjust_cfa_offset 32
+    .cfi_rel_offset x0, 0
+    .cfi_rel_offset x1, 8
     str    xLR, [sp, #16]                                     // Align sp by 16 bytes.
+    .cfi_rel_offset xLR, 16
     mov    x0, x2                                             // Pass the class as the first param.
     bl     artReadBarrierMark
     mov    x2, x0                                             // Get the (marked) class back.
     ldp    x0, x1, [sp, #0]                                   // Restore registers.
+    .cfi_restore x0
+    .cfi_restore x1
     ldr    xLR, [sp, #16]
+    .cfi_restore xLR
     add    sp, sp, #32
+    .cfi_adjust_cfa_offset -32
     b      .Ldo_allocation\name
+.endif
 .Lslow_path\name:
     SETUP_SAVE_REFS_ONLY_FRAME                 // Save callee saves in case of GC.
     mov    x2, xSELF                           // Pass Thread::Current.
@@ -2157,9 +2169,14 @@
 END \name
 .endm
 
-GENERATE_ALLOC_OBJECT_REGION_TLAB art_quick_alloc_object_region_tlab, artAllocObjectFromCodeRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH, 0
-GENERATE_ALLOC_OBJECT_REGION_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED, 1
-GENERATE_ALLOC_OBJECT_REGION_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED, 1
+// Use ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED since the null check is already done in GENERATE_ALLOC_OBJECT_REGION_TLAB.
+GENERATE_ALLOC_OBJECT_REGION_TLAB art_quick_alloc_object_region_tlab, artAllocObjectFromCodeRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED, 0, 1
+// No read barrier for the resolved or initialized cases since the caller is responsible for the
+// read barrier due to the to-space invariant.
+GENERATE_ALLOC_OBJECT_REGION_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED, 1, 0
+GENERATE_ALLOC_OBJECT_REGION_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED, 1, 0
+
+// TODO: We could use this macro for the normal tlab allocator too.
 
 // The common code for art_quick_alloc_array_*region_tlab
 .macro GENERATE_ALLOC_ARRAY_REGION_TLAB name, entrypoint, fast_path, is_resolved
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index f941c52..06ff7ab 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1268,28 +1268,9 @@
     int3
     int3
 #endif
+    // No read barrier since the caller is responsible for that.
     movq %rdi, %rdx
-    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_marking
-.Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path_exit:
     ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path
-.Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_marking:
-    // Check the mark bit, if it is 1 avoid the read barrier.
-    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
-    jnz .Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path_exit
-.Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path:
-    // The read barrier slow path. Mark the class.
-    PUSH rdi
-    PUSH rsi
-    subq LITERAL(8), %rsp // 16 byte alignment
-    // Outgoing argument set up
-    movq %rdx, %rdi                                            // Pass the class as the first param.
-    call SYMBOL(artReadBarrierMark)                            // cxx_name(mirror::Object* obj)
-    movq %rax, %rdx
-    addq LITERAL(8), %rsp
-    POP rsi
-    POP rdi
-    jmp .Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path_exit
 .Lart_quick_alloc_object_resolved_region_tlab_slow_path:
     ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedRegionTLAB
 END_FUNCTION art_quick_alloc_object_resolved_region_tlab
@@ -1303,29 +1284,9 @@
     int3
     int3
 #endif
-    // Might need a special macro since rsi and edx is 32b/64b mismatched.
     movq %rdi, %rdx
-    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_marking
-.Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path_exit:
+    // No read barrier since the caller is responsible for that.
     ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_region_tlab_slow_path
-.Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_marking:
-    // Check the mark bit, if it is 1 avoid the read barrier.
-    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
-    jnz .Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path
-.Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path:
-    // The read barrier slow path. Mark the class.
-    PUSH rdi
-    PUSH rsi
-    subq LITERAL(8), %rsp // 16 byte alignment
-    // Outgoing argument set up
-    movq %rdx, %rdi                                            // Pass the class as the first param.
-    call SYMBOL(artReadBarrierMark)                            // cxx_name(mirror::Object* obj)
-    movq %rax, %rdx
-    addq LITERAL(8), %rsp
-    POP rsi
-    POP rdi
-    jmp .Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path_exit
 .Lart_quick_alloc_object_initialized_region_tlab_slow_path:
     ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedRegionTLAB
 END_FUNCTION art_quick_alloc_object_initialized_region_tlab