Revert "Revert "Implement rosalloc fast path in assembly for 32 bit arm.""

With a heap poisoning fix.

This reverts commit cf91c7d973f3b2f491abc61d47c141782c96d46e.

Bug: 9986565
Change-Id: Ia72edbde65ef6119e1931a77cc4c595a0b80ce31
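
For reviewers, the control flow of the new arm fast path is roughly the
following standalone C++ sketch. All types and field names here are
simplified stand-ins, the inlined constants (10, 0x80000000u) are
placeholders for MIRROR_CLASS_STATUS_INITIALIZED and kAccClassIsFinalizable
rather than values taken from this change, and SlowPath() stands for the
artAllocObjectFromCodeRosAlloc call:

    #include <cstdint>

    // Simplified stand-in types; the real ones live in mirror/,
    // gc/allocator/rosalloc.h and thread.h.
    struct Slot { Slot* next; };
    struct Run { struct { Slot* head; uint32_t size; } free_list; };
    struct Class { uint32_t status, access_flags, object_size; };
    struct Object { Class* klass; };
    struct Method { Class** dex_cache_resolved_types; };
    struct Thread { Run** rosalloc_runs; Object** alloc_stack_top; Object** alloc_stack_end; };

    Object* SlowPath();  // stands in for the artAllocObjectFromCodeRosAlloc call

    Object* AllocObjectRosAllocFast(uint32_t type_idx, Method* m, Thread* self) {
      Class* klass = m->dex_cache_resolved_types[type_idx];
      if (klass == nullptr) return SlowPath();                 // class not resolved yet
      if (klass->status != 10 /* initialized */) return SlowPath();
      if ((klass->access_flags & 0x80000000u /* finalizable */) != 0) return SlowPath();
      if (self->alloc_stack_top >= self->alloc_stack_end) return SlowPath();
      uint32_t size = klass->object_size;
      if (size >= 128 /* kMaxThreadLocalBracketSize */) return SlowPath();
      uint32_t idx = (size - 1) >> 4;                          // rosalloc bracket index
      Run* run = self->rosalloc_runs[idx];
      Slot* slot = run->free_list.head;
      if (slot == nullptr) return SlowPath();                  // no free slot in the run
      run->free_list.head = slot->next;                        // pop the free-list head
      Object* obj = reinterpret_cast<Object*>(slot);
      obj->klass = klass;              // also overwrites slot->next; offsets asserted equal
      *self->alloc_stack_top++ = obj;  // push onto the thread-local allocation stack
      run->free_list.size--;
      __atomic_thread_fence(__ATOMIC_SEQ_CST);                 // models the dmb ish
      return obj;
    }

Note that the >= on the size check mirrors the bhs in the assembly: an
object of exactly kMaxThreadLocalBracketSize bytes takes the slow path,
which is conservative but harmless.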
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index d09631b..631b784 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -891,7 +891,120 @@
 ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALL_ALLOC_ENTRYPOINTS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
+ENTRY art_quick_alloc_object_rosalloc
+    // Fast path rosalloc allocation.
+    // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
+    // r2, r3, r12: free.
+    ldr    r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32]    // Load dex cache resolved types array
+                                                              // Load the class (r2)
+    ldr    r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+    cbz    r2, .Lart_quick_alloc_object_rosalloc_slow_path    // Check null class
+                                                              // Check class status.
+    ldr    r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
+    cmp    r3, #MIRROR_CLASS_STATUS_INITIALIZED
+    bne    .Lart_quick_alloc_object_rosalloc_slow_path
+                                                              // Add a fake dependence from the
+                                                              // following access flag and size
+                                                              // loads to the status load.
+                                                              // This is to prevent those loads
+                                                              // from being reordered above the
+                                                              // status load and reading wrong
+                                                              // values (an alternative is to use
+                                                              // a load-acquire for the status).
+    eor    r3, r3, r3
+    add    r2, r2, r3
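+                                                              // (The eor zeroes r3 but still
+                                                              // reads the status value, and the
+                                                              // add then feeds r3 into r2, so
+                                                              // the loads below carry an address
+                                                              // dependence on the status load.)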
+                                                              // Check that the access flags
+                                                              // don't include kAccClassIsFinalizable.
+    ldr    r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
+    tst    r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
+    bne    .Lart_quick_alloc_object_rosalloc_slow_path
+
+    ldr    r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]     // Check if the thread local
+                                                              // allocation stack has room.
+                                                              // TODO: consider using ldrd.
+    ldr    r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
+    cmp    r3, r12
+    bhs    .Lart_quick_alloc_object_rosalloc_slow_path
+
+    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (r3)
+    cmp    r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
+                                                              // local allocation
+    bhs    .Lart_quick_alloc_object_rosalloc_slow_path
+                                                              // Compute the rosalloc bracket index
+                                                              // from the size.
+                                                              // Align the size up to the rosalloc
+                                                              // bracket quantum size, divide by
+                                                              // the quantum size, and subtract 1.
+                                                              // The code below is a shorter but
+                                                              // equivalent version of that.
+    sub    r3, r3, #1
+    lsr    r3, r3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
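+                                                              // E.g. with the 16-byte quantum,
+                                                              // sizes 1-16 map to bracket 0 and
+                                                              // sizes 17-32 to bracket 1:
+                                                              // (size - 1) >> 4 equals
+                                                              // RoundUp(size, 16) / 16 - 1.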
+                                                              // Load the rosalloc run (r12)
+    add    r12, r9, r3, lsl #POINTER_SIZE_SHIFT
+    ldr    r12, [r12, #THREAD_ROSALLOC_RUNS_OFFSET]
+                                                              // Load the free list head (r3). This
+                                                              // will be the return value.
+    ldr    r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
+    cbz    r3, .Lart_quick_alloc_object_rosalloc_slow_path
+    // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
+    ldr    r1, [r3, #ROSALLOC_SLOT_NEXT_OFFSET]               // Load the next pointer of the head
+                                                              // and update the list head with the
+                                                              // next pointer.
+    str    r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
+                                                              // Store the class pointer in the
+                                                              // header. This also overwrites the
+                                                              // next pointer. The offsets are
+                                                              // asserted to match.
+#if ROSALLOC_SLOT_NEXT_OFFSET != MIRROR_OBJECT_CLASS_OFFSET
+#error "Class pointer needs to overwrite next pointer."
+#endif
+    POISON_HEAP_REF r2
+    str    r2, [r3, #MIRROR_OBJECT_CLASS_OFFSET]
+                                                              // Push the new object onto the thread
+                                                              // local allocation stack and
+                                                              // increment the thread local
+                                                              // allocation stack top.
+    ldr    r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+    str    r3, [r1], #COMPRESSED_REFERENCE_SIZE               // (Increment r1 as a side effect.)
+    str    r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+                                                              // Decrement the size of the free list
+    ldr    r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
+    sub    r1, #1
+                                                              // TODO: consider combining this store
+                                                              // and the list head store above using
+                                                              // strd.
+    str    r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
+                                                              // Fence. This is "ish" not "ishst" so
+                                                              // that the code after this allocation
+                                                              // site will see the right values in
+                                                              // the fields of the class.
+                                                              // (Alternatively we could use "ishst"
+                                                              // if we use load-acquire for the
+                                                              // class status load.)
+    dmb    ish
+    mov    r0, r3                                             // Set the return value and return.
+    bx     lr
+
+.Lart_quick_alloc_object_rosalloc_slow_path:
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  r2, r3  @ save callee saves in case of GC
+    mov    r2, r9                     @ pass Thread::Current
+    bl     artAllocObjectFromCodeRosAlloc     @ (uint32_t type_idx, Method* method, Thread*)
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END art_quick_alloc_object_rosalloc
 
     /*
      * Called by managed code when the value in rSUSPEND has been decremented to 0.
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index ef5edbb..fbacdbc 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -113,7 +113,8 @@
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_dlmalloc_instrumented, DlMallocInstrumented)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_dlmalloc_instrumented, DlMallocInstrumented)
 
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
+// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_rosalloc, RosAlloc)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 4a106e4..2f485ae 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -788,6 +788,7 @@
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
 
 ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 5c413d2..95f0ccb 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -809,6 +809,7 @@
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
 DEFINE_FUNCTION art_quick_alloc_object_tlab
     // Fast path tlab allocation.
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index d98fc51..69f6fe9 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -19,6 +19,7 @@
 
 #if defined(__cplusplus)
 #include "art_method.h"
+#include "gc/allocator/rosalloc.h"
 #include "lock_word.h"
 #include "mirror/class.h"
 #include "mirror/string.h"
@@ -53,6 +54,14 @@
 #define ADD_TEST_EQ(x, y)
 #endif
 
+#if defined(__LP64__)
+#define POINTER_SIZE_SHIFT 3
+#else
+#define POINTER_SIZE_SHIFT 2
+#endif
+ADD_TEST_EQ(static_cast<size_t>(1U << POINTER_SIZE_SHIFT),
+            static_cast<size_t>(__SIZEOF_POINTER__))
+
 // Size of references to the heap on the stack.
 #define STACK_REFERENCE_SIZE 4
 ADD_TEST_EQ(static_cast<size_t>(STACK_REFERENCE_SIZE), sizeof(art::StackReference<art::mirror::Object>))
@@ -62,6 +71,10 @@
 ADD_TEST_EQ(static_cast<size_t>(COMPRESSED_REFERENCE_SIZE),
             sizeof(art::mirror::CompressedReference<art::mirror::Object>))
 
+#define COMPRESSED_REFERENCE_SIZE_SHIFT 2
+ADD_TEST_EQ(static_cast<size_t>(1U << COMPRESSED_REFERENCE_SIZE_SHIFT),
+            static_cast<size_t>(COMPRESSED_REFERENCE_SIZE))
+
 // Note: these callee save methods loads require read barriers.
 // Offset of field Runtime::callee_save_methods_[kSaveAll]
 #define RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET 0
@@ -120,6 +133,18 @@
 #define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
             art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tlsPtr_.rosalloc_runs.
+#define THREAD_ROSALLOC_RUNS_OFFSET (THREAD_LOCAL_POS_OFFSET + 3 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_ROSALLOC_RUNS_OFFSET,
+            art::Thread::RosAllocRunsOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tlsPtr_.thread_local_alloc_stack_top.
+#define THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET (THREAD_ROSALLOC_RUNS_OFFSET + 34 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET,
+            art::Thread::ThreadLocalAllocStackTopOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tlsPtr_.thread_local_alloc_stack_end.
+#define THREAD_LOCAL_ALLOC_STACK_END_OFFSET (THREAD_ROSALLOC_RUNS_OFFSET + 35 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,
+            art::Thread::ThreadLocalAllocStackEndOffset<__SIZEOF_POINTER__>().Int32Value())
 
 // Offsets within java.lang.Object.
 #define MIRROR_OBJECT_CLASS_OFFSET 0
@@ -236,6 +261,44 @@
 ADD_TEST_EQ(static_cast<uint32_t>(OBJECT_ALIGNMENT_MASK_TOGGLED),
             ~static_cast<uint32_t>(art::kObjectAlignment - 1))
 
+#define ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE 128
+ADD_TEST_EQ(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::kMaxThreadLocalBracketSize))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT 4
+ADD_TEST_EQ(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSizeShift))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK 15
+ADD_TEST_EQ(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED32 0xfffffff0
+ADD_TEST_EQ(static_cast<uint32_t>(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED32),
+            ~static_cast<uint32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED64 0xfffffffffffffff0
+ADD_TEST_EQ(static_cast<uint64_t>(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED64),
+            ~static_cast<uint64_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
+
+#define ROSALLOC_RUN_FREE_LIST_OFFSET 8
+ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListOffset()))
+
+#define ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET 0
+ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListHeadOffset()))
+
+#define ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET 16
+ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListSizeOffset()))
+
+#define ROSALLOC_SLOT_NEXT_OFFSET 0
+ADD_TEST_EQ(ROSALLOC_SLOT_NEXT_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunSlotNextOffset()))
+// Assert this so that we can avoid zeroing the next field by installing the class pointer.
+ADD_TEST_EQ(ROSALLOC_SLOT_NEXT_OFFSET, MIRROR_OBJECT_CLASS_OFFSET)
+
 #if defined(__cplusplus)
 }  // End of CheckAsmSupportOffsets.
 #endif
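
As with the other constants in this header, the new #defines are validated
against the C++ declarations via ADD_TEST_EQ, which expands to nothing when
the header is included from assembly and to an equality check inside the
CheckAsmSupportOffsets function when included from C++. A minimal standalone
illustration of that pattern (assert stands in for the runtime's check
macro):

    #include <cassert>
    #include <cstddef>

    struct Slot { Slot* next_; };

    #define ROSALLOC_SLOT_NEXT_OFFSET 0  // the literal the assembly sees

    // Assembly build: #define ADD_TEST_EQ(x, y)   (expands to nothing)
    // C++ build: each entry checks the literal against the real declaration.
    #define ADD_TEST_EQ(x, y) assert((x) == (y));

    void CheckAsmSupportOffsets() {
      ADD_TEST_EQ(static_cast<size_t>(ROSALLOC_SLOT_NEXT_OFFSET), offsetof(Slot, next_))
    }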
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 87f1392..3ce3d63 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -131,6 +131,7 @@
 
    private:
     Slot* next_;  // Next slot in the list.
+    friend class RosAlloc;
   };
 
   // We use the tail (kUseTail == true) for the bulk or thread-local free lists to avoid the need to
@@ -302,6 +303,7 @@
     // free without traversing the whole free list.
     uint32_t size_;
     uint32_t padding_ ATTRIBUTE_UNUSED;
+    friend class RosAlloc;
   };
 
   // Represents a run of memory slots of the same size.
@@ -482,7 +484,7 @@
   static constexpr uint8_t kMagicNumFree = 43;
   // The number of size brackets. Sync this with the length of Thread::rosalloc_runs_.
   static constexpr size_t kNumOfSizeBrackets = kNumRosAllocThreadLocalSizeBrackets;
-  // The number of smaller size brackets that are 16 bytes apart.
+  // The number of smaller size brackets that are the quantum size apart.
   static constexpr size_t kNumOfQuantumSizeBrackets = 32;
   // The sizes (the slot sizes, in bytes) of the size brackets.
   static size_t bracketSizes[kNumOfSizeBrackets];
@@ -520,9 +522,7 @@
   }
   // Returns true if the given allocation size is for a thread local allocation.
   static bool IsSizeForThreadLocal(size_t size) {
-    DCHECK_GT(kNumThreadLocalSizeBrackets, 0U);
-    size_t max_thread_local_bracket_idx = kNumThreadLocalSizeBrackets - 1;
-    bool is_size_for_thread_local = size <= bracketSizes[max_thread_local_bracket_idx];
+    bool is_size_for_thread_local = size <= kMaxThreadLocalBracketSize;
     DCHECK(size > kLargeSizeThreshold ||
            (is_size_for_thread_local == (SizeToIndex(size) < kNumThreadLocalSizeBrackets)));
     return is_size_for_thread_local;
@@ -634,6 +634,16 @@
   // are less than this index. We use shared (current) runs for the rest.
   static const size_t kNumThreadLocalSizeBrackets = 8;
 
+  // The size of the largest bracket we use thread-local runs for.
+  // This should be equal to bracketSizes[kNumThreadLocalSizeBrackets - 1].
+  static const size_t kMaxThreadLocalBracketSize = 128;
+
+  // The bracket size increment for the brackets of size <= 512 bytes.
+  static constexpr size_t kBracketQuantumSize = 16;
+
+  // Equal to Log2(kBracketQuantumSize).
+  static constexpr size_t kBracketQuantumSizeShift = 4;
+
  private:
   // The base address of the memory region that's managed by this allocator.
   uint8_t* base_;
@@ -770,6 +780,19 @@
            size_t page_release_size_threshold = kDefaultPageReleaseSizeThreshold);
   ~RosAlloc();
 
+  static size_t RunFreeListOffset() {
+    return OFFSETOF_MEMBER(Run, free_list_);
+  }
+  static size_t RunFreeListHeadOffset() {
+    return OFFSETOF_MEMBER(SlotFreeList<false>, head_);
+  }
+  static size_t RunFreeListSizeOffset() {
+    return OFFSETOF_MEMBER(SlotFreeList<false>, size_);
+  }
+  static size_t RunSlotNextOffset() {
+    return OFFSETOF_MEMBER(Slot, next_);
+  }
+
   // If kThreadUnsafe is true then the allocator may avoid acquiring some locks as an optimization.
   // If used, this may cause race conditions if multiple threads are allocating at the same time.
   template<bool kThreadSafe = true>
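
The four new offset accessors back the ROSALLOC_* constants in asm_support.h
above; the assembly addresses the free-list head as run +
ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET. The head
and size offsets (0 and 16) being word-size independent suggests the head_
and tail_ fields are stored as fixed 64-bit values; a stand-in layout sketch
under that assumption, with field names approximated rather than copied from
the real header:

    #include <cstdint>
    #include <cstddef>

    // Widths inferred from the offsets asserted in asm_support.h.
    struct SlotFreeList { uint64_t head_; uint64_t tail_; uint32_t size_; uint32_t padding_; };
    struct Run {
      uint8_t magic_num_, size_bracket_idx_, is_thread_local_, to_be_bulk_freed_;
      uint32_t padding_;
      SlotFreeList free_list_;  // offset 8 == ROSALLOC_RUN_FREE_LIST_OFFSET
    };

    static_assert(offsetof(Run, free_list_) + offsetof(SlotFreeList, head_) == 8,
                  "offset of the free-list head ldr in the arm fast path");
    static_assert(offsetof(Run, free_list_) + offsetof(SlotFreeList, size_) == 24,
                  "offset of the free-list size load/store in the arm fast path");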
diff --git a/runtime/thread.h b/runtime/thread.h
index 8cea10c..8f3461a 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -626,6 +626,24 @@
     return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, thread_local_objects));
   }
 
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> RosAllocRunsOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
+                                                                rosalloc_runs));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadLocalAllocStackTopOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
+                                                                thread_local_alloc_stack_top));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadLocalAllocStackEndOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
+                                                                thread_local_alloc_stack_end));
+  }
+
   // Size of stack less any space reserved for stack overflow
   size_t GetStackSize() const {
     return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);
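
The 3, 34, and 35 pointer-size multipliers in the new asm_support.h offsets
encode the field order these accessors rely on: rosalloc_runs is an array of
34 run pointers (kNumRosAllocThreadLocalSizeBrackets; the length here is
inferred from the offset deltas, not from the real declaration) that sits
three pointers after thread_local_pos and is immediately followed by the two
alloc-stack pointers. A stand-in sketch of that tail of
tls_ptr_sized_values:

    #include <cstddef>

    // Stand-in for the relevant tail of Thread::tls_ptr_sized_values; the
    // real struct has many more fields before these.
    struct TlsPtrTail {
      void* thread_local_pos;
      void* thread_local_end;
      void* thread_local_objects;
      void* rosalloc_runs[34];             // kNumRosAllocThreadLocalSizeBrackets
      void* thread_local_alloc_stack_top;  // rosalloc_runs + 34 pointers
      void* thread_local_alloc_stack_end;  // rosalloc_runs + 35 pointers
    };

    static_assert(offsetof(TlsPtrTail, rosalloc_runs) ==
                  offsetof(TlsPtrTail, thread_local_pos) + 3 * sizeof(void*),
                  "THREAD_ROSALLOC_RUNS_OFFSET delta");
    static_assert(offsetof(TlsPtrTail, thread_local_alloc_stack_top) ==
                  offsetof(TlsPtrTail, rosalloc_runs) + 34 * sizeof(void*),
                  "THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET delta");
    static_assert(offsetof(TlsPtrTail, thread_local_alloc_stack_end) ==
                  offsetof(TlsPtrTail, rosalloc_runs) + 35 * sizeof(void*),
                  "THREAD_LOCAL_ALLOC_STACK_END_OFFSET delta");

If the Thread layout ever changes, the ADD_TEST_EQ entries above catch the
mismatch rather than letting the assembly read from stale offsets.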