ARM64: More JIT Code Cache maintenance

Move implementation closer to ARM's sample code.

Test: art/test.py --target
Change-Id: Ic205c700528f45afc281fb20e9336c9737d5f409
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 27501b9..3bee560 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -747,7 +747,7 @@
 #ifdef __aarch64__
 
 static void FlushJitCodeCacheRange(uint8_t* code_ptr,
-                                   uint8_t* writable_ptr ATTRIBUTE_UNUSED,
+                                   uint8_t* writable_ptr,
                                    size_t code_size) {
   // Cache maintenance instructions can cause permission faults when a
   // page is not present (e.g. swapped out or not backed). These
@@ -765,35 +765,63 @@
   // cache-line sizes for big and little cores.
   static const uintptr_t kSafeCacheLineSize = 32;
 
-  // Ensure stores are present in data cache.
-  __asm __volatile("dsb sy");
+  // Ensure stores are present in L1 data cache.
+  __asm __volatile("dsb ish" ::: "memory");
 
-  uintptr_t addr = RoundDown(reinterpret_cast<uintptr_t>(code_ptr), kSafeCacheLineSize);
-  const uintptr_t limit_addr = RoundUp(reinterpret_cast<uintptr_t>(code_ptr) + code_size,
-                                       kSafeCacheLineSize);
   volatile uint8_t mutant;
-  while (addr < limit_addr) {
+
+  // Push dirty cache-lines out to the point of unification (PoU). The
+  // point of unification is the first point in the cache/memory
+  // hierarchy where the instruction cache and data cache have the
+  // same view of memory. The PoU is where an instruction fetch will
+  // fetch the new code generated by the JIT.
+  //
+  // See: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch11s04.html
+  uintptr_t writable_addr = RoundDown(reinterpret_cast<uintptr_t>(writable_ptr),
+                                      kSafeCacheLineSize);
+  uintptr_t writable_end  = RoundUp(reinterpret_cast<uintptr_t>(writable_ptr) + code_size,
+                                    kSafeCacheLineSize);
+  while (writable_addr < writable_end) {
     // Read from the cache-line to minimize the chance that a cache
     // maintenance instruction causes a fault (see kernel bug comment
     // above).
-    mutant = *reinterpret_cast<const uint8_t*>(addr);
+    mutant = *reinterpret_cast<const uint8_t*>(writable_addr);
+
+    // Flush cache-line
+    __asm volatile("dc cvau, %0" :: "r"(writable_addr) : "memory");
+    writable_addr += kSafeCacheLineSize;
+  }
+
+  __asm __volatile("dsb ish" ::: "memory");
+
+  uintptr_t code_addr = RoundDown(reinterpret_cast<uintptr_t>(code_ptr), kSafeCacheLineSize);
+  const uintptr_t code_end = RoundUp(reinterpret_cast<uintptr_t>(code_ptr) + code_size,
+                                     kSafeCacheLineSize);
+  while (code_addr < code_end) {
+    // Read from the cache-line to minimize the chance that a cache
+    // maintenance instruction causes a fault (see kernel bug comment
+    // above).
+    mutant = *reinterpret_cast<const uint8_t*>(code_addr);
 
     // Invalidating the data cache line is only strictly necessary
     // when the JIT code cache has two mappings (the default). We know
     // this cache line is clean so this is just invalidating it (using
-    // "dc ivac" would be preferable, but is privileged).
-    __asm volatile("dc cvau, %0" :: "r"(addr));
+    // "dc ivac" would be preferable, but counts as a write and this
+    // memory may not be mapped with write permission).
+    __asm volatile("dc cvau, %0" :: "r"(code_addr) : "memory");
 
     // Invalidate the instruction cache line to force instructions in
     // range to be re-fetched following update.
-    __asm volatile("ic ivau, %0" :: "r"(addr));
+    __asm volatile("ic ivau, %0" :: "r"(code_addr) : "memory");
 
-    addr += kSafeCacheLineSize;
+    code_addr += kSafeCacheLineSize;
   }
 
-  // Drain data and instruction buffers.
-  __asm __volatile("dsb sy");
-  __asm __volatile("isb sy");
+  // Wait for code cache invalidations to complete.
+  __asm __volatile("dsb ish" ::: "memory");
+
+  // Reset fetched instruction stream.
+  __asm __volatile("isb");
 }
 
 #else  // __aarch64