Do allocation fence before pushing on allocation stack

Heap::VisitObjects relies on having valid classes for objects in
the allocation stack. If the writes reorder, the thread calling
VisitObjects could see the free list pointer instead of the class
of the object. I believe this is causing crashes in VisitObjects.

Bug: 28790624

Test: Volantis booted

Change-Id: I0f2d4097de1ef3f5caf670ecc977d4d6837872ca
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index e9ad1f4..c893e77 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1633,6 +1633,18 @@
 #endif
     POISON_HEAP_REF w2
     str    w2, [x3, #MIRROR_OBJECT_CLASS_OFFSET]
+                                                              // Fence. This is "ish" not "ishst" so
+                                                              // that it also ensures ordering of
+                                                              // the class status load with respect
+                                                              // to later accesses to the class
+                                                              // object. Alternatively we could use
+                                                              // "ishst" if we use load-acquire for
+                                                              // the class status load.)
+                                                              // Needs to be done before pushing on
+                                                              // allocation since Heap::VisitObjects
+                                                              // relies on seeing the class pointer.
+                                                              // b/28790624
+    dmb    ish
                                                               // Push the new object onto the thread
                                                               // local allocation stack and
                                                               // increment the thread local
@@ -1647,14 +1659,7 @@
                                                               // and the list head store above using
                                                               // strd.
     str    w1, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
-                                                              // Fence. This is "ish" not "ishst" so
-                                                              // that the code after this allocation
-                                                              // site will see the right values in
-                                                              // the fields of the class.
-                                                              // Alternatively we could use "ishst"
-                                                              // if we use load-acquire for the
-                                                              // class status load.)
-    dmb    ish
+
     mov    x0, x3                                             // Set the return value and return.
     ret
 .Lart_quick_alloc_object_rosalloc_slow_path: