ARM64: Use link-time generated thunks for Baker CC read barrier.

Remaining work for follow-up CLs:
  - array loads,
  - volatile field loads,
  - use implicit null check in field thunk.
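
An illustrative field-load sequence under the new scheme (a sketch only: the
register choices are made up and the exact instructions are emitted by the
compiler and the link-time generated thunk):

      <conditional branch to the field thunk, taken only when the GC is marking>
      ldr   w5, [x3, #offset]   // Reference load; LR points just past it, so the
                                // thunk and the entrypoint find it at [LR, #-4].
  gray_return_address:
      // If the holder is not gray, the thunk returns directly to the LDR above.
      // Otherwise it loads the reference into IP0 and tail-calls
      // art_quick_read_barrier_mark_introspection, which marks the reference
      // and returns it in w5 via the return switch.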

Test: Added tests to relative_patcher_arm64
Test: New run-test 160-read-barrier-stress
Test: m test-art-target-gtest on Nexus 6P.
Test: testrunner.py --target on Nexus 6P.
Bug: 29516974
Bug: 30126666
Bug: 36141117
Change-Id: Id68ff171c55a3f1bf1ac1b657f480531aa7b3710
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index b2bbd0d..c7fa7f5 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2494,6 +2494,240 @@
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg28, w28, x28
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, w29, x29
 
+
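+// Helper macros for building register-indexed switch tables: FOR_XREGISTERS and
+// FOR_WREGISTERS invoke \macro_for_register once for each of the 32 register
+// encodings (x- or w-form respectively) and \macro_for_reserved_register for
+// IP0, IP1, LR and SP, keeping one table slot per register number.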
+.macro SELECT_X_OR_W_FOR_MACRO macro_to_use, x, w, xreg
+    .if \xreg
+      \macro_to_use \x
+    .else
+      \macro_to_use \w
+    .endif
+.endm
+
+.macro FOR_REGISTERS macro_for_register, macro_for_reserved_register, xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x0, w0, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x1, w1, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x2, w2, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x3, w3, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x4, w4, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x5, w5, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x6, w6, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x7, w7, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x8, w8, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x9, w9, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x10, w10, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x11, w11, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x12, w12, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x13, w13, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x14, w14, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x15, w15, \xreg
+    \macro_for_reserved_register  // IP0 is reserved
+    \macro_for_reserved_register  // IP1 is reserved
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x18, w18, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x19, w19, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x20, w20, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x21, w21, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x22, w22, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x23, w23, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x24, w24, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x25, w25, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x26, w26, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x27, w27, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x28, w28, \xreg
+    SELECT_X_OR_W_FOR_MACRO \macro_for_register, x29, w29, \xreg
+    \macro_for_reserved_register  // lr is reserved
+    \macro_for_reserved_register  // sp is reserved
+.endm
+
+.macro FOR_XREGISTERS macro_for_register, macro_for_reserved_register
+    FOR_REGISTERS \macro_for_register, \macro_for_reserved_register, /* xreg */ 1
+.endm
+
+.macro FOR_WREGISTERS macro_for_register, macro_for_reserved_register
+    FOR_REGISTERS \macro_for_register, \macro_for_reserved_register, /* xreg */ 0
+.endm
+
+.macro BRK0_BRK0
+    brk 0
+    brk 0
+.endm
+
+#if BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET != BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#error "Array and field introspection code sharing requires same LDR offset."
+#endif
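+// Emits one array-load switch case: load the reference using the given index
+// register and branch to the marking entrypoint. For example, the case emitted
+// (via FOR_XREGISTERS below) for index register x5 is:
+//     ldr   wIP0, [xIP0, x5, lsl #2]
+//     b     art_quick_read_barrier_mark_introspection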
+.macro INTROSPECTION_ARRAY_LOAD index_reg
+    ldr   wIP0, [xIP0, \index_reg, lsl #2]
+    b     art_quick_read_barrier_mark_introspection
+.endm
+
+.macro MOV_WIP0_TO_WREG_AND_BL_LR reg
+    mov   \reg, wIP0
+    br    lr  // Do not use RET as we do not enter the entrypoint with "BL".
+.endm
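+// Each return switch case produced from this macro moves the marked reference
+// from wIP0 into the destination register of the original LDR and returns to
+// the compiled code; for example, the w5 case is:
+//     mov   w5, wIP0
+//     br    lr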
+
+.macro READ_BARRIER_MARK_INTROSPECTION_SLOW_PATH ldr_offset
+    /*
+     * Allocate 44 stack slots * 8 = 352 bytes:
+     * - 19 slots for core registers X0-15, X18-X19, LR
+     * - 1 slot padding
+     * - 24 slots for floating-point registers D0-D7 and D16-D31
+     */
+    // Save all potentially live caller-save core registers.
+    SAVE_TWO_REGS_INCREASE_FRAME x0, x1, 352
+    SAVE_TWO_REGS  x2,  x3, 16
+    SAVE_TWO_REGS  x4,  x5, 32
+    SAVE_TWO_REGS  x6,  x7, 48
+    SAVE_TWO_REGS  x8,  x9, 64
+    SAVE_TWO_REGS x10, x11, 80
+    SAVE_TWO_REGS x12, x13, 96
+    SAVE_TWO_REGS x14, x15, 112
+    SAVE_TWO_REGS x18, x19, 128       // Skip x16, x17, i.e. IP0, IP1.
+    SAVE_REG      xLR,      144       // Save return address, skip padding at 152.
+    // Save all potentially live caller-save floating-point registers.
+    stp   d0, d1,   [sp, #160]
+    stp   d2, d3,   [sp, #176]
+    stp   d4, d5,   [sp, #192]
+    stp   d6, d7,   [sp, #208]
+    stp   d16, d17, [sp, #224]
+    stp   d18, d19, [sp, #240]
+    stp   d20, d21, [sp, #256]
+    stp   d22, d23, [sp, #272]
+    stp   d24, d25, [sp, #288]
+    stp   d26, d27, [sp, #304]
+    stp   d28, d29, [sp, #320]
+    stp   d30, d31, [sp, #336]
+
+    mov   x0, xIP0
+    bl    artReadBarrierMark          // artReadBarrierMark(obj)
+    mov   xIP0, x0
+
+    // Restore core regs, except x0 and x1 as the return register switch case
+    // address calculation is smoother with an extra register.
+    RESTORE_TWO_REGS  x2,  x3, 16
+    RESTORE_TWO_REGS  x4,  x5, 32
+    RESTORE_TWO_REGS  x6,  x7, 48
+    RESTORE_TWO_REGS  x8,  x9, 64
+    RESTORE_TWO_REGS x10, x11, 80
+    RESTORE_TWO_REGS x12, x13, 96
+    RESTORE_TWO_REGS x14, x15, 112
+    RESTORE_TWO_REGS x18, x19, 128    // Skip x16, x17, i.e. IP0, IP1.
+    RESTORE_REG      xLR,      144    // Restore return address.
+    // Restore all potentially live caller-save floating-point registers.
+    ldp   d0, d1,   [sp, #160]
+    ldp   d2, d3,   [sp, #176]
+    ldp   d4, d5,   [sp, #192]
+    ldp   d6, d7,   [sp, #208]
+    ldp   d16, d17, [sp, #224]
+    ldp   d18, d19, [sp, #240]
+    ldp   d20, d21, [sp, #256]
+    ldp   d22, d23, [sp, #272]
+    ldp   d24, d25, [sp, #288]
+    ldp   d26, d27, [sp, #304]
+    ldp   d28, d29, [sp, #320]
+    ldp   d30, d31, [sp, #336]
+
+    ldr   x0, [lr, #\ldr_offset]      // Load the instruction.
+    adr   xIP1, .Lmark_introspection_return_switch
+    bfi   xIP1, x0, #3, #5            // Calculate switch case address.
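+    // For example, if the original load was "ldr w5, [x3, #off]", bits 0-4 of
+    // the loaded instruction hold Rt = 5, so the BFI above adds 5 * 8 = 40 to
+    // the 256-byte aligned switch base, selecting the "mov w5, wIP0; br lr" case.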
+    RESTORE_TWO_REGS_DECREASE_FRAME x0, x1, 352
+    br    xIP1
+.endm
+
+    /*
+     * Use introspection to load a reference from the same address as the LDR
+     * instruction in generated code would load (unless loaded by the thunk,
+     * see below), call ReadBarrier::Mark() with that reference if needed
+     * and return it in the same register as the LDR instruction would load.
+     *
+     * The entrypoint is called through a thunk that differs across load kinds.
+     * For field and array loads the LDR instruction in generated code follows
+     * the branch to the thunk, i.e. the LDR is at [LR, #-4], and the thunk
+     * knows the holder and performs the gray bit check, returning to the LDR
+     * instruction if the object is not gray, so this entrypoint no longer
+     * needs to know anything about the holder. For GC root loads, the LDR
+     * instruction in generated code precedes the branch to the thunk (i.e.
+     * the LDR is at [LR, #-8]) and the thunk does not do the gray bit check.
+     *
+     * For field accesses and array loads with a constant index the thunk loads
+     * the reference into IP0 using introspection and calls the main entrypoint,
+     * art_quick_read_barrier_mark_introspection.
+     *
+     * For array accesses with a non-constant index, the thunk inserts bits
+     * 16-21 of the LDR instruction into the entrypoint address, effectively
+     * calculating a switch case label based on the index register (bits 16-20)
+     * and adding an extra offset (bit 21 is set) to differentiate from the
+     * main entrypoint, then moves the base register to IP0 and jumps to the
+     * switch case. Therefore we need to align the main entrypoint to 512 bytes,
+     * accounting for a 256-byte offset followed by 32 array entrypoints
+     * starting at art_quick_read_barrier_mark_introspection_arrays, each
+     * containing an LDR (register) and a branch to the main entrypoint.
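+     *
+     * For example, an LDR (register) with index register x5 has Rm = 5 in bits
+     * 16-20 and bit 21 set, so the inserted value is 0b100101 = 37 and the thunk
+     * branches to the main entrypoint address plus 37 * 8 = 296 bytes, i.e. 40
+     * bytes into the array switch, which holds the "ldr wIP0, [xIP0, x5, lsl #2]"
+     * case.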
+     *
+     * For GC root accesses we cannot use the main entrypoint because of the
+     * different offset where the LDR instruction in generated code is located.
+     * To re-use the same entrypoint pointer in generated code, we make sure
+     * that the gc root entrypoint (a copy of the entrypoint with a different
+     * offset for introspection loads) is located at a known offset (768 bytes,
+     * or BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET) from the main
+     * entrypoint and the GC root thunk adjusts the entrypoint pointer, moves
+     * the root register to IP0 and jumps to the customized entrypoint,
+     * art_quick_read_barrier_mark_introspection_gc_roots. The thunk also
+     * performs all the fast-path checks, so we need just the slow path.
+     *
+     * The code structure is
+     *   art_quick_read_barrier_mark_introspection:
+     *     Up to 256 bytes for the main entrypoint code.
+     *     Padding to 256 bytes if needed.
+     *   art_quick_read_barrier_mark_introspection_arrays:
+     *     Exactly 256 bytes for array load switch cases (32x2 instructions).
+     *   .Lmark_introspection_return_switch:
+     *     Exactly 256 bytes for return switch cases (32x2 instructions).
+     *   art_quick_read_barrier_mark_introspection_gc_roots:
+     *     GC root entrypoint code.
+     */
+    .balign 512
+ENTRY art_quick_read_barrier_mark_introspection
+    // At this point, IP0 contains the reference, IP1 can be freely used.
+    // If reference is null, just return it in the right register.
+    cbz   wIP0, .Lmark_introspection_return
+    // Use wIP1 as temp and check the mark bit of the reference.
+    ldr   wIP1, [xIP0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tbz   wIP1, #LOCK_WORD_MARK_BIT_SHIFT, .Lmark_introspection_unmarked
+.Lmark_introspection_return:
+    // Without an extra register for the return switch case address calculation,
+    // we exploit the high word of xIP0 to temporarily store ref_reg*8, so the
+    // return switch below must move wIP0 instead of xIP0 to the register.
+    ldr   wIP1, [lr, #BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET]  // Load the instruction.
+    bfi   xIP0, xIP1, #(32 + 3), #5   // Extract ref_reg*8 to high word in xIP0.
+    adr   xIP1, .Lmark_introspection_return_switch
+    bfxil xIP1, xIP0, #32, #8         // Calculate return switch case address.
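+    // For example, Rt = 5 yields 5 * 8 = 40 in bits 32-39 of xIP0, and the BFXIL
+    // copies it into the low byte of the 256-byte aligned switch address,
+    // selecting the "mov w5, wIP0; br lr" return case.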
+    br    xIP1
+.Lmark_introspection_unmarked:
+    // Check if the top two bits are set; if so, this is a forwarding address.
+    tst   wIP1, wIP1, lsl #1
+    bmi   .Lmark_introspection_forwarding_address
+    READ_BARRIER_MARK_INTROSPECTION_SLOW_PATH BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET
+
+.Lmark_introspection_forwarding_address:
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    lsl   wIP0, wIP1, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    b .Lmark_introspection_return
+
+    // We're very close to the allotted 256B for the entrypoint code before the
+    // array switch cases. Should we go a little bit over the limit, we can
+    // move some code after the array switch cases and return switch cases.
+    .balign 256
+    .hidden art_quick_read_barrier_mark_introspection_arrays
+    .global art_quick_read_barrier_mark_introspection_arrays
+art_quick_read_barrier_mark_introspection_arrays:
+    FOR_XREGISTERS INTROSPECTION_ARRAY_LOAD, BRK0_BRK0
+.Lmark_introspection_return_switch:
+    FOR_WREGISTERS MOV_WIP0_TO_WREG_AND_BL_LR, BRK0_BRK0
+    .hidden art_quick_read_barrier_mark_introspection_gc_roots
+    .global art_quick_read_barrier_mark_introspection_gc_roots
+art_quick_read_barrier_mark_introspection_gc_roots:
+    READ_BARRIER_MARK_INTROSPECTION_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET
+END art_quick_read_barrier_mark_introspection
+
 .extern artInvokePolymorphic
 ENTRY art_quick_invoke_polymorphic
     SETUP_SAVE_REFS_AND_ARGS_FRAME                // Save callee saves in case allocation triggers GC.