ART: Arm64 optimizing compiler intrinsics

Implement most intrinsics for the optimizing compiler for Arm64.
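
Move the VIXL operand helpers into a shared common_arm64.h, hoist
SlowPathCodeARM64 into the codegen header, and factor out
GenerateStaticOrDirectCall so the intrinsic slow path can call back
into the managed implementation. Add intrinsics_arm64.cc with location
builders and code generators for the bit-conversion, reverse and
reverseBytes, Math, Memory peek/poke, Thread.currentThread and Unsafe
intrinsics, and teach the intrinsics recognizer about
Unsafe.getObject[Volatile]. Also remove the kCoalescedImplicitNullCheck
constant from the x86-64 backend.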

Change-Id: Idb459be09f0524cb9aeab7a5c7fccb1c6b65a707
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 1f561b7..cc7bf3c 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -16,9 +16,12 @@
 
 #include "code_generator_arm64.h"
 
+#include "common_arm64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "gc/accounting/card_table.h"
+#include "intrinsics.h"
+#include "intrinsics_arm64.h"
 #include "mirror/array-inl.h"
 #include "mirror/art_method.h"
 #include "mirror/class.h"
@@ -35,175 +38,37 @@
 #error "ARM64 Codegen VIXL macro-assembler macro already defined."
 #endif
 
-
 namespace art {
 
 namespace arm64 {
 
-// TODO: Tune the use of Load-Acquire, Store-Release vs Data Memory Barriers.
-// For now we prefer the use of load-acquire, store-release over explicit memory barriers.
-static constexpr bool kUseAcquireRelease = true;
+using helpers::CPURegisterFrom;
+using helpers::DRegisterFrom;
+using helpers::FPRegisterFrom;
+using helpers::HeapOperand;
+using helpers::HeapOperandFrom;
+using helpers::InputCPURegisterAt;
+using helpers::InputFPRegisterAt;
+using helpers::InputRegisterAt;
+using helpers::InputOperandAt;
+using helpers::Int64ConstantFrom;
+using helpers::Is64BitType;
+using helpers::IsFPType;
+using helpers::IsIntegralType;
+using helpers::LocationFrom;
+using helpers::OperandFromMemOperand;
+using helpers::OutputCPURegister;
+using helpers::OutputFPRegister;
+using helpers::OutputRegister;
+using helpers::RegisterFrom;
+using helpers::StackOperandFrom;
+using helpers::VIXLRegCodeFromART;
+using helpers::WRegisterFrom;
+using helpers::XRegisterFrom;
+
 static constexpr size_t kHeapRefSize = sizeof(mirror::HeapReference<mirror::Object>);
 static constexpr int kCurrentMethodStackOffset = 0;
 
-namespace {
-
-bool IsFPType(Primitive::Type type) {
-  return type == Primitive::kPrimFloat || type == Primitive::kPrimDouble;
-}
-
-bool IsIntegralType(Primitive::Type type) {
-  switch (type) {
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-    case Primitive::kPrimInt:
-    case Primitive::kPrimLong:
-      return true;
-    default:
-      return false;
-  }
-}
-
-bool Is64BitType(Primitive::Type type) {
-  return type == Primitive::kPrimLong || type == Primitive::kPrimDouble;
-}
-
-// Convenience helpers to ease conversion to and from VIXL operands.
-static_assert((SP == 31) && (WSP == 31) && (XZR == 32) && (WZR == 32),
-              "Unexpected values for register codes.");
-
-int VIXLRegCodeFromART(int code) {
-  if (code == SP) {
-    return vixl::kSPRegInternalCode;
-  }
-  if (code == XZR) {
-    return vixl::kZeroRegCode;
-  }
-  return code;
-}
-
-int ARTRegCodeFromVIXL(int code) {
-  if (code == vixl::kSPRegInternalCode) {
-    return SP;
-  }
-  if (code == vixl::kZeroRegCode) {
-    return XZR;
-  }
-  return code;
-}
-
-Register XRegisterFrom(Location location) {
-  DCHECK(location.IsRegister());
-  return Register::XRegFromCode(VIXLRegCodeFromART(location.reg()));
-}
-
-Register WRegisterFrom(Location location) {
-  DCHECK(location.IsRegister());
-  return Register::WRegFromCode(VIXLRegCodeFromART(location.reg()));
-}
-
-Register RegisterFrom(Location location, Primitive::Type type) {
-  DCHECK(type != Primitive::kPrimVoid && !IsFPType(type));
-  return type == Primitive::kPrimLong ? XRegisterFrom(location) : WRegisterFrom(location);
-}
-
-Register OutputRegister(HInstruction* instr) {
-  return RegisterFrom(instr->GetLocations()->Out(), instr->GetType());
-}
-
-Register InputRegisterAt(HInstruction* instr, int input_index) {
-  return RegisterFrom(instr->GetLocations()->InAt(input_index),
-                      instr->InputAt(input_index)->GetType());
-}
-
-FPRegister DRegisterFrom(Location location) {
-  DCHECK(location.IsFpuRegister());
-  return FPRegister::DRegFromCode(location.reg());
-}
-
-FPRegister SRegisterFrom(Location location) {
-  DCHECK(location.IsFpuRegister());
-  return FPRegister::SRegFromCode(location.reg());
-}
-
-FPRegister FPRegisterFrom(Location location, Primitive::Type type) {
-  DCHECK(IsFPType(type));
-  return type == Primitive::kPrimDouble ? DRegisterFrom(location) : SRegisterFrom(location);
-}
-
-FPRegister OutputFPRegister(HInstruction* instr) {
-  return FPRegisterFrom(instr->GetLocations()->Out(), instr->GetType());
-}
-
-FPRegister InputFPRegisterAt(HInstruction* instr, int input_index) {
-  return FPRegisterFrom(instr->GetLocations()->InAt(input_index),
-                        instr->InputAt(input_index)->GetType());
-}
-
-CPURegister CPURegisterFrom(Location location, Primitive::Type type) {
-  return IsFPType(type) ? CPURegister(FPRegisterFrom(location, type))
-                        : CPURegister(RegisterFrom(location, type));
-}
-
-CPURegister OutputCPURegister(HInstruction* instr) {
-  return IsFPType(instr->GetType()) ? static_cast<CPURegister>(OutputFPRegister(instr))
-                                    : static_cast<CPURegister>(OutputRegister(instr));
-}
-
-CPURegister InputCPURegisterAt(HInstruction* instr, int index) {
-  return IsFPType(instr->InputAt(index)->GetType())
-      ? static_cast<CPURegister>(InputFPRegisterAt(instr, index))
-      : static_cast<CPURegister>(InputRegisterAt(instr, index));
-}
-
-int64_t Int64ConstantFrom(Location location) {
-  HConstant* instr = location.GetConstant();
-  return instr->IsIntConstant() ? instr->AsIntConstant()->GetValue()
-                                : instr->AsLongConstant()->GetValue();
-}
-
-Operand OperandFrom(Location location, Primitive::Type type) {
-  if (location.IsRegister()) {
-    return Operand(RegisterFrom(location, type));
-  } else {
-    return Operand(Int64ConstantFrom(location));
-  }
-}
-
-Operand InputOperandAt(HInstruction* instr, int input_index) {
-  return OperandFrom(instr->GetLocations()->InAt(input_index),
-                     instr->InputAt(input_index)->GetType());
-}
-
-MemOperand StackOperandFrom(Location location) {
-  return MemOperand(sp, location.GetStackIndex());
-}
-
-MemOperand HeapOperand(const Register& base, size_t offset = 0) {
-  // A heap reference must be 32bit, so fit in a W register.
-  DCHECK(base.IsW());
-  return MemOperand(base.X(), offset);
-}
-
-MemOperand HeapOperand(const Register& base, Offset offset) {
-  return HeapOperand(base, offset.SizeValue());
-}
-
-MemOperand HeapOperandFrom(Location location, Offset offset) {
-  return HeapOperand(RegisterFrom(location, Primitive::kPrimNot), offset);
-}
-
-Location LocationFrom(const Register& reg) {
-  return Location::RegisterLocation(ARTRegCodeFromVIXL(reg.code()));
-}
-
-Location LocationFrom(const FPRegister& fpreg) {
-  return Location::FpuRegisterLocation(fpreg.code());
-}
-
-}  // namespace
-
 inline Condition ARM64Condition(IfCondition cond) {
   switch (cond) {
     case kCondEQ: return eq;
@@ -264,20 +129,6 @@
 #define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()->
 #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, x).Int32Value()
 
-class SlowPathCodeARM64 : public SlowPathCode {
- public:
-  SlowPathCodeARM64() : entry_label_(), exit_label_() {}
-
-  vixl::Label* GetEntryLabel() { return &entry_label_; }
-  vixl::Label* GetExitLabel() { return &exit_label_; }
-
- private:
-  vixl::Label entry_label_;
-  vixl::Label exit_label_;
-
-  DISALLOW_COPY_AND_ASSIGN(SlowPathCodeARM64);
-};
-
 class BoundsCheckSlowPathARM64 : public SlowPathCodeARM64 {
  public:
   BoundsCheckSlowPathARM64(HBoundsCheck* instruction,
@@ -602,7 +453,7 @@
   }
 
   int frame_size = GetFrameSize();
-  __ Str(w0, MemOperand(sp, -frame_size, PreIndex));
+  __ Str(kArtMethodRegister, MemOperand(sp, -frame_size, PreIndex));
   __ PokeCPURegList(GetFramePreservedRegisters(), frame_size - FrameEntrySpillSize());
 
   // Stack layout:
@@ -978,12 +829,11 @@
   Register temp_base = temps.AcquireX();
   Primitive::Type type = instruction->GetType();
 
-  DCHECK(!src.IsRegisterOffset());
   DCHECK(!src.IsPreIndex());
   DCHECK(!src.IsPostIndex());
 
   // TODO(vixl): Let the MacroAssembler handle MemOperand.
-  __ Add(temp_base, src.base(), src.offset());
+  __ Add(temp_base, src.base(), OperandFromMemOperand(src));
   MemOperand base = MemOperand(temp_base);
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -1058,12 +908,12 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register temp_base = temps.AcquireX();
 
-  DCHECK(!dst.IsRegisterOffset());
   DCHECK(!dst.IsPreIndex());
   DCHECK(!dst.IsPostIndex());
 
   // TODO(vixl): Let the MacroAssembler handle this.
-  __ Add(temp_base, dst.base(), dst.offset());
+  Operand op = OperandFromMemOperand(dst);
+  __ Add(temp_base, dst.base(), op);
   MemOperand base = MemOperand(temp_base);
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -1956,19 +1806,37 @@
 }
 
 void LocationsBuilderARM64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
+  IntrinsicLocationsBuilderARM64 intrinsic(GetGraph()->GetArena());
+  if (intrinsic.TryDispatch(invoke)) {
+    return;
+  }
+
   HandleInvoke(invoke);
 }
 
 void LocationsBuilderARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  IntrinsicLocationsBuilderARM64 intrinsic(GetGraph()->GetArena());
+  if (intrinsic.TryDispatch(invoke)) {
+    return;
+  }
+
   HandleInvoke(invoke);
 }
 
-void InstructionCodeGeneratorARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
-  Register temp = WRegisterFrom(invoke->GetLocations()->GetTemp(0));
-  // Make sure that ArtMethod* is passed in W0 as per the calling convention
-  DCHECK(temp.Is(w0));
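+// Emits the intrinsic's code via IntrinsicCodeGeneratorARM64 if the invoke's locations
+// were marked as intrinsified; returns whether any intrinsic code was generated.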
+static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorARM64* codegen) {
+  if (invoke->GetLocations()->Intrinsified()) {
+    IntrinsicCodeGeneratorARM64 intrinsic(codegen);
+    intrinsic.Dispatch(invoke);
+    return true;
+  }
+  return false;
+}
+
+void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Register temp) {
+  // Make sure that ArtMethod* is passed in kArtMethodRegister as per the calling convention.
+  DCHECK(temp.Is(kArtMethodRegister));
   size_t index_in_cache = mirror::Array::DataOffset(kHeapRefSize).SizeValue() +
-    invoke->GetDexMethodIndex() * kHeapRefSize;
+      invoke->GetDexMethodIndex() * kHeapRefSize;
 
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
@@ -1978,22 +1846,35 @@
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
   // temp = method;
-  codegen_->LoadCurrentMethod(temp);
+  LoadCurrentMethod(temp);
   // temp = temp->dex_cache_resolved_methods_;
   __ Ldr(temp, HeapOperand(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset()));
   // temp = temp[index_in_cache];
   __ Ldr(temp, HeapOperand(temp, index_in_cache));
   // lr = temp->entry_point_from_quick_compiled_code_;
   __ Ldr(lr, HeapOperand(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-                          kArm64WordSize)));
+      kArm64WordSize)));
   // lr();
   __ Blr(lr);
 
-  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
-  DCHECK(!codegen_->IsLeafMethod());
+  RecordPcInfo(invoke, invoke->GetDexPc());
+  DCHECK(!IsLeafMethod());
+}
+
+void InstructionCodeGeneratorARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  if (TryGenerateIntrinsicCode(invoke, codegen_)) {
+    return;
+  }
+
+  Register temp = WRegisterFrom(invoke->GetLocations()->GetTemp(0));
+  codegen_->GenerateStaticOrDirectCall(invoke, temp);
 }
 
 void InstructionCodeGeneratorARM64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
+  if (TryGenerateIntrinsicCode(invoke, codegen_)) {
+    return;
+  }
+
   LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   Register temp = WRegisterFrom(invoke->GetLocations()->GetTemp(0));
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 96013e5..100dafe 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -31,7 +31,10 @@
 namespace arm64 {
 
 class CodeGeneratorARM64;
-class SlowPathCodeARM64;
+
+// TODO: Tune the use of Load-Acquire, Store-Release vs Data Memory Barriers.
+// For now we prefer the use of load-acquire, store-release over explicit memory barriers.
+static constexpr bool kUseAcquireRelease = true;
 
 // Use a local definition to prevent copying mistakes.
 static constexpr size_t kArm64WordSize = kArm64PointerSize;
@@ -45,7 +48,8 @@
 };
 static constexpr size_t kParameterFPRegistersLength = arraysize(kParameterFPRegisters);
 
-const vixl::Register tr = vixl::x18;        // Thread Register
+const vixl::Register tr = vixl::x18;                        // Thread Register
+static const vixl::Register kArtMethodRegister = vixl::w0;  // Method register on invoke.
 
 const vixl::CPURegList vixl_reserved_core_registers(vixl::ip0, vixl::ip1);
 const vixl::CPURegList vixl_reserved_fp_registers(vixl::d31);
@@ -56,6 +60,20 @@
 
 Location ARM64ReturnLocation(Primitive::Type return_type);
 
+class SlowPathCodeARM64 : public SlowPathCode {
+ public:
+  SlowPathCodeARM64() : entry_label_(), exit_label_() {}
+
+  vixl::Label* GetEntryLabel() { return &entry_label_; }
+  vixl::Label* GetExitLabel() { return &exit_label_; }
+
+ private:
+  vixl::Label entry_label_;
+  vixl::Label exit_label_;
+
+  DISALLOW_COPY_AND_ASSIGN(SlowPathCodeARM64);
+};
+
 class InvokeDexCallingConvention : public CallingConvention<vixl::Register, vixl::FPRegister> {
  public:
   InvokeDexCallingConvention()
@@ -274,6 +292,8 @@
     return false;
   }
 
+  void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, vixl::Register temp);
+
  private:
   // Labels for each block that will be compiled.
   vixl::Label* block_labels_;
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 1ac2ab7..c30f4c2 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -37,8 +37,6 @@
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
 static constexpr size_t kParameterFloatRegistersLength = arraysize(kParameterFloatRegisters);
 
-static constexpr bool kCoalescedImplicitNullCheck = false;
-
 class InvokeDexCallingConvention : public CallingConvention<Register, FloatRegister> {
  public:
   InvokeDexCallingConvention() : CallingConvention(
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
new file mode 100644
index 0000000..7077f98
--- /dev/null
+++ b/compiler/optimizing/common_arm64.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_COMMON_ARM64_H_
+#define ART_COMPILER_OPTIMIZING_COMMON_ARM64_H_
+
+#include "locations.h"
+#include "nodes.h"
+#include "utils/arm64/assembler_arm64.h"
+#include "a64/disasm-a64.h"
+#include "a64/macro-assembler-a64.h"
+
+namespace art {
+namespace arm64 {
+namespace helpers {
+
+constexpr bool IsFPType(Primitive::Type type) {
+  return type == Primitive::kPrimFloat || type == Primitive::kPrimDouble;
+}
+
+static inline bool IsIntegralType(Primitive::Type type) {
+  switch (type) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      return true;
+    default:
+      return false;
+  }
+}
+
+constexpr bool Is64BitType(Primitive::Type type) {
+  return type == Primitive::kPrimLong || type == Primitive::kPrimDouble;
+}
+
+// Convenience helpers to ease conversion to and from VIXL operands.
+static_assert((SP == 31) && (WSP == 31) && (XZR == 32) && (WZR == 32),
+              "Unexpected values for register codes.");
+
+static inline int VIXLRegCodeFromART(int code) {
+  if (code == SP) {
+    return vixl::kSPRegInternalCode;
+  }
+  if (code == XZR) {
+    return vixl::kZeroRegCode;
+  }
+  return code;
+}
+
+static inline int ARTRegCodeFromVIXL(int code) {
+  if (code == vixl::kSPRegInternalCode) {
+    return SP;
+  }
+  if (code == vixl::kZeroRegCode) {
+    return XZR;
+  }
+  return code;
+}
+
+static inline vixl::Register XRegisterFrom(Location location) {
+  DCHECK(location.IsRegister());
+  return vixl::Register::XRegFromCode(VIXLRegCodeFromART(location.reg()));
+}
+
+static inline vixl::Register WRegisterFrom(Location location) {
+  DCHECK(location.IsRegister());
+  return vixl::Register::WRegFromCode(VIXLRegCodeFromART(location.reg()));
+}
+
+static inline vixl::Register RegisterFrom(Location location, Primitive::Type type) {
+  DCHECK(type != Primitive::kPrimVoid && !IsFPType(type));
+  return type == Primitive::kPrimLong ? XRegisterFrom(location) : WRegisterFrom(location);
+}
+
+static inline vixl::Register OutputRegister(HInstruction* instr) {
+  return RegisterFrom(instr->GetLocations()->Out(), instr->GetType());
+}
+
+static inline vixl::Register InputRegisterAt(HInstruction* instr, int input_index) {
+  return RegisterFrom(instr->GetLocations()->InAt(input_index),
+                      instr->InputAt(input_index)->GetType());
+}
+
+static inline vixl::FPRegister DRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister());
+  return vixl::FPRegister::DRegFromCode(location.reg());
+}
+
+static inline vixl::FPRegister SRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister());
+  return vixl::FPRegister::SRegFromCode(location.reg());
+}
+
+static inline vixl::FPRegister FPRegisterFrom(Location location, Primitive::Type type) {
+  DCHECK(IsFPType(type));
+  return type == Primitive::kPrimDouble ? DRegisterFrom(location) : SRegisterFrom(location);
+}
+
+static inline vixl::FPRegister OutputFPRegister(HInstruction* instr) {
+  return FPRegisterFrom(instr->GetLocations()->Out(), instr->GetType());
+}
+
+static inline vixl::FPRegister InputFPRegisterAt(HInstruction* instr, int input_index) {
+  return FPRegisterFrom(instr->GetLocations()->InAt(input_index),
+                        instr->InputAt(input_index)->GetType());
+}
+
+static inline vixl::CPURegister CPURegisterFrom(Location location, Primitive::Type type) {
+  return IsFPType(type) ? vixl::CPURegister(FPRegisterFrom(location, type))
+                        : vixl::CPURegister(RegisterFrom(location, type));
+}
+
+static inline vixl::CPURegister OutputCPURegister(HInstruction* instr) {
+  return IsFPType(instr->GetType()) ? static_cast<vixl::CPURegister>(OutputFPRegister(instr))
+                                    : static_cast<vixl::CPURegister>(OutputRegister(instr));
+}
+
+static inline vixl::CPURegister InputCPURegisterAt(HInstruction* instr, int index) {
+  return IsFPType(instr->InputAt(index)->GetType())
+      ? static_cast<vixl::CPURegister>(InputFPRegisterAt(instr, index))
+      : static_cast<vixl::CPURegister>(InputRegisterAt(instr, index));
+}
+
+static inline int64_t Int64ConstantFrom(Location location) {
+  HConstant* instr = location.GetConstant();
+  return instr->IsIntConstant() ? instr->AsIntConstant()->GetValue()
+                                : instr->AsLongConstant()->GetValue();
+}
+
+static inline vixl::Operand OperandFrom(Location location, Primitive::Type type) {
+  if (location.IsRegister()) {
+    return vixl::Operand(RegisterFrom(location, type));
+  } else {
+    return vixl::Operand(Int64ConstantFrom(location));
+  }
+}
+
+static inline vixl::Operand InputOperandAt(HInstruction* instr, int input_index) {
+  return OperandFrom(instr->GetLocations()->InAt(input_index),
+                     instr->InputAt(input_index)->GetType());
+}
+
+static inline vixl::MemOperand StackOperandFrom(Location location) {
+  return vixl::MemOperand(vixl::sp, location.GetStackIndex());
+}
+
+static inline vixl::MemOperand HeapOperand(const vixl::Register& base, size_t offset = 0) {
+  // A heap reference must be 32 bits, so it fits in a W register.
+  DCHECK(base.IsW());
+  return vixl::MemOperand(base.X(), offset);
+}
+
+static inline vixl::MemOperand HeapOperand(const vixl::Register& base, Offset offset) {
+  return HeapOperand(base, offset.SizeValue());
+}
+
+static inline vixl::MemOperand HeapOperandFrom(Location location, Offset offset) {
+  return HeapOperand(RegisterFrom(location, Primitive::kPrimNot), offset);
+}
+
+static inline Location LocationFrom(const vixl::Register& reg) {
+  return Location::RegisterLocation(ARTRegCodeFromVIXL(reg.code()));
+}
+
+static inline Location LocationFrom(const vixl::FPRegister& fpreg) {
+  return Location::FpuRegisterLocation(fpreg.code());
+}
+
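+// Extracts the offset component of a MemOperand as an Operand: an immediate offset,
+// or a register offset with its extend or shift, so it can be added to the base register.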
+static inline vixl::Operand OperandFromMemOperand(const vixl::MemOperand& mem_op) {
+  if (mem_op.IsImmediateOffset()) {
+    return vixl::Operand(mem_op.offset());
+  } else {
+    DCHECK(mem_op.IsRegisterOffset());
+    if (mem_op.extend() != vixl::NO_EXTEND) {
+      return vixl::Operand(mem_op.regoffset(), mem_op.extend(), mem_op.shift_amount());
+    } else if (mem_op.shift() != vixl::NO_SHIFT) {
+      return vixl::Operand(mem_op.regoffset(), mem_op.shift(), mem_op.shift_amount());
+    } else {
+      LOG(FATAL) << "Should not reach here";
+      UNREACHABLE();
+    }
+  }
+}
+
+}  // namespace helpers
+}  // namespace arm64
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_COMMON_ARM64_H_
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index fe0e7f2..36cf856 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -47,25 +47,25 @@
   if (is_op_size) {
     switch (static_cast<OpSize>(data)) {
       case kSignedByte:
-        return Primitive::Type::kPrimByte;
+        return Primitive::kPrimByte;
       case kSignedHalf:
-        return Primitive::Type::kPrimShort;
+        return Primitive::kPrimShort;
       case k32:
-        return Primitive::Type::kPrimInt;
+        return Primitive::kPrimInt;
       case k64:
-        return Primitive::Type::kPrimLong;
+        return Primitive::kPrimLong;
       default:
         LOG(FATAL) << "Unknown/unsupported op size " << data;
         UNREACHABLE();
     }
   } else {
     if ((data & kIntrinsicFlagIsLong) != 0) {
-      return Primitive::Type::kPrimLong;
+      return Primitive::kPrimLong;
     }
     if ((data & kIntrinsicFlagIsObject) != 0) {
-      return Primitive::Type::kPrimNot;
+      return Primitive::kPrimNot;
     }
-    return Primitive::Type::kPrimInt;
+    return Primitive::kPrimInt;
   }
 }
 
@@ -82,9 +82,9 @@
     // Bit manipulations.
     case kIntrinsicReverseBits:
       switch (GetType(method.d.data, true)) {
-        case Primitive::Type::kPrimInt:
+        case Primitive::kPrimInt:
           return Intrinsics::kIntegerReverse;
-        case Primitive::Type::kPrimLong:
+        case Primitive::kPrimLong:
           return Intrinsics::kLongReverse;
         default:
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
@@ -93,11 +93,11 @@
       break;
     case kIntrinsicReverseBytes:
       switch (GetType(method.d.data, true)) {
-        case Primitive::Type::kPrimShort:
+        case Primitive::kPrimShort:
           return Intrinsics::kShortReverseBytes;
-        case Primitive::Type::kPrimInt:
+        case Primitive::kPrimInt:
           return Intrinsics::kIntegerReverseBytes;
-        case Primitive::Type::kPrimLong:
+        case Primitive::kPrimLong:
           return Intrinsics::kLongReverseBytes;
         default:
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
@@ -154,13 +154,13 @@
     // Memory.peek.
     case kIntrinsicPeek:
       switch (GetType(method.d.data, true)) {
-        case Primitive::Type::kPrimByte:
+        case Primitive::kPrimByte:
           return Intrinsics::kMemoryPeekByte;
-        case Primitive::Type::kPrimShort:
+        case Primitive::kPrimShort:
           return Intrinsics::kMemoryPeekShortNative;
-        case Primitive::Type::kPrimInt:
+        case Primitive::kPrimInt:
           return Intrinsics::kMemoryPeekIntNative;
-        case Primitive::Type::kPrimLong:
+        case Primitive::kPrimLong:
           return Intrinsics::kMemoryPeekLongNative;
         default:
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
@@ -171,13 +171,13 @@
     // Memory.poke.
     case kIntrinsicPoke:
       switch (GetType(method.d.data, true)) {
-        case Primitive::Type::kPrimByte:
+        case Primitive::kPrimByte:
           return Intrinsics::kMemoryPokeByte;
-        case Primitive::Type::kPrimShort:
+        case Primitive::kPrimShort:
           return Intrinsics::kMemoryPokeShortNative;
-        case Primitive::Type::kPrimInt:
+        case Primitive::kPrimInt:
           return Intrinsics::kMemoryPokeIntNative;
-        case Primitive::Type::kPrimLong:
+        case Primitive::kPrimLong:
           return Intrinsics::kMemoryPokeLongNative;
         default:
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
@@ -199,11 +199,11 @@
 
     case kIntrinsicCas:
       switch (GetType(method.d.data, false)) {
-        case Primitive::Type::kPrimNot:
+        case Primitive::kPrimNot:
           return Intrinsics::kUnsafeCASObject;
-        case Primitive::Type::kPrimInt:
+        case Primitive::kPrimInt:
           return Intrinsics::kUnsafeCASInt;
-        case Primitive::Type::kPrimLong:
+        case Primitive::kPrimLong:
           return Intrinsics::kUnsafeCASLong;
         default:
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
@@ -213,10 +213,12 @@
     case kIntrinsicUnsafeGet: {
       const bool is_volatile = (method.d.data & kIntrinsicFlagIsVolatile);
       switch (GetType(method.d.data, false)) {
-        case Primitive::Type::kPrimInt:
+        case Primitive::kPrimInt:
           return is_volatile ? Intrinsics::kUnsafeGetVolatile : Intrinsics::kUnsafeGet;
-        case Primitive::Type::kPrimLong:
+        case Primitive::kPrimLong:
           return is_volatile ? Intrinsics::kUnsafeGetLongVolatile : Intrinsics::kUnsafeGetLong;
+        case Primitive::kPrimNot:
+          return is_volatile ? Intrinsics::kUnsafeGetObjectVolatile : Intrinsics::kUnsafeGetObject;
         default:
           LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
           UNREACHABLE();
@@ -230,7 +232,7 @@
           ((method.d.data & kIntrinsicFlagIsOrdered) != 0)  ? kOrdered :
                                                               kNoSync;
       switch (GetType(method.d.data, false)) {
-        case Primitive::Type::kPrimInt:
+        case Primitive::kPrimInt:
           switch (sync) {
             case kNoSync:
               return Intrinsics::kUnsafePut;
@@ -240,7 +242,7 @@
               return Intrinsics::kUnsafePutOrdered;
           }
           break;
-        case Primitive::Type::kPrimLong:
+        case Primitive::kPrimLong:
           switch (sync) {
             case kNoSync:
               return Intrinsics::kUnsafePutLong;
@@ -250,7 +252,7 @@
               return Intrinsics::kUnsafePutLongOrdered;
           }
           break;
-        case Primitive::Type::kPrimNot:
+        case Primitive::kPrimNot:
           switch (sync) {
             case kNoSync:
               return Intrinsics::kUnsafePutObject;
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
new file mode 100644
index 0000000..6d10544
--- /dev/null
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -0,0 +1,1001 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "intrinsics_arm64.h"
+
+#include "code_generator_arm64.h"
+#include "common_arm64.h"
+#include "entrypoints/quick/quick_entrypoints.h"
+#include "intrinsics.h"
+#include "mirror/array-inl.h"
+#include "mirror/art_method.h"
+#include "mirror/string.h"
+#include "thread.h"
+#include "utils/arm64/assembler_arm64.h"
+#include "utils/arm64/constants_arm64.h"
+
+#include "a64/disasm-a64.h"
+#include "a64/macro-assembler-a64.h"
+
+using namespace vixl;   // NOLINT(build/namespaces)
+
+namespace art {
+
+namespace arm64 {
+
+using helpers::DRegisterFrom;
+using helpers::FPRegisterFrom;
+using helpers::HeapOperand;
+using helpers::IsIntegralType;
+using helpers::RegisterFrom;
+using helpers::SRegisterFrom;
+using helpers::WRegisterFrom;
+using helpers::XRegisterFrom;
+
+
+namespace {
+
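+// Builds a MemOperand from a raw 64-bit address held in a core register. Used by the
+// Memory peek/poke intrinsics, which access absolute addresses rather than heap references.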
+ALWAYS_INLINE inline MemOperand AbsoluteHeapOperandFrom(Location location, size_t offset = 0) {
+  return MemOperand(XRegisterFrom(location), offset);
+}
+
+}  // namespace
+
+vixl::MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() {
+  return codegen_->GetAssembler()->vixl_masm_;
+}
+
+ArenaAllocator* IntrinsicCodeGeneratorARM64::GetAllocator() {
+  return codegen_->GetGraph()->GetArena();
+}
+
+#define __ codegen->GetAssembler()->vixl_masm_->
+
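+// Copies the return value of a call from the ARM64 return location into the invoke's
+// output location (no-op for void).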
+static void MoveFromReturnRegister(Location trg,
+                                   Primitive::Type type,
+                                   CodeGeneratorARM64* codegen) {
+  if (!trg.IsValid()) {
+    DCHECK(type == Primitive::kPrimVoid);
+    return;
+  }
+
+  DCHECK_NE(type, Primitive::kPrimVoid);
+
+  if (IsIntegralType(type)) {
+    Register trg_reg = RegisterFrom(trg, type);
+    Register res_reg = RegisterFrom(ARM64ReturnLocation(type), type);
+    __ Mov(trg_reg, res_reg, kDiscardForSameWReg);
+  } else {
+    FPRegister trg_reg = FPRegisterFrom(trg, type);
+    FPRegister res_reg = FPRegisterFrom(ARM64ReturnLocation(type), type);
+    __ Fmov(trg_reg, res_reg);
+  }
+}
+
+static void MoveArguments(HInvoke* invoke, ArenaAllocator* arena, CodeGeneratorARM64* codegen) {
+  if (invoke->InputCount() == 0) {
+    return;
+  }
+
+  LocationSummary* locations = invoke->GetLocations();
+  InvokeDexCallingConventionVisitor calling_convention_visitor;
+
+  // We're moving potentially two or more locations to locations that could overlap, so we need
+  // a parallel move resolver.
+  HParallelMove parallel_move(arena);
+
+  for (size_t i = 0; i < invoke->InputCount(); i++) {
+    HInstruction* input = invoke->InputAt(i);
+    Location cc_loc = calling_convention_visitor.GetNextLocation(input->GetType());
+    Location actual_loc = locations->InAt(i);
+
+    parallel_move.AddMove(actual_loc, cc_loc, nullptr);
+  }
+
+  codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+}
+
+// Slow-path for fallback (calling the managed code to handle the intrinsic) in an intrinsified
+// call. This will copy the arguments into the positions for a regular call.
+//
+// Note: The actual parameters are required to be in the locations given by the invoke's location
+//       summary. If an intrinsic modifies those locations before a slowpath call, they must be
+//       restored!
+class IntrinsicSlowPathARM64 : public SlowPathCodeARM64 {
+ public:
+  explicit IntrinsicSlowPathARM64(HInvoke* invoke) : invoke_(invoke) { }
+
+  void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE {
+    CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
+    __ Bind(GetEntryLabel());
+
+    codegen->SaveLiveRegisters(invoke_->GetLocations());
+
+    MoveArguments(invoke_, codegen->GetGraph()->GetArena(), codegen);
+
+    if (invoke_->IsInvokeStaticOrDirect()) {
+      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), kArtMethodRegister);
+    } else {
+      UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented";
+      UNREACHABLE();
+    }
+
+    // Copy the result back to the expected output.
+    Location out = invoke_->GetLocations()->Out();
+    if (out.IsValid()) {
+      DCHECK(out.IsRegister());  // TODO: Replace this when we support output in memory.
+      DCHECK(!invoke_->GetLocations()->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
+      MoveFromReturnRegister(out, invoke_->GetType(), codegen);
+    }
+
+    codegen->RestoreLiveRegisters(invoke_->GetLocations());
+    __ B(GetExitLabel());
+  }
+
+ private:
+  // The instruction where this slow path is happening.
+  HInvoke* const invoke_;
+
+  DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathARM64);
+};
+
+#undef __
+
+bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) {
+  Dispatch(invoke);
+  LocationSummary* res = invoke->GetLocations();
+  return res != nullptr && res->Intrinsified();
+}
+
+#define __ masm->
+
+static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetOut(Location::RequiresRegister());
+}
+
+static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresFpuRegister());
+}
+
+static void MoveFPToInt(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) {
+  Location input = locations->InAt(0);
+  Location output = locations->Out();
+  __ Fmov(is64bit ? XRegisterFrom(output) : WRegisterFrom(output),
+          is64bit ? DRegisterFrom(input) : SRegisterFrom(input));
+}
+
+static void MoveIntToFP(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) {
+  Location input = locations->InAt(0);
+  Location output = locations->Out();
+  __ Fmov(is64bit ? DRegisterFrom(output) : SRegisterFrom(output),
+          is64bit ? XRegisterFrom(input) : WRegisterFrom(input));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
+  CreateIntToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
+  MoveFPToInt(invoke->GetLocations(), true, GetVIXLAssembler());
+}
+void IntrinsicCodeGeneratorARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
+  MoveIntToFP(invoke->GetLocations(), true, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
+  CreateIntToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
+  MoveFPToInt(invoke->GetLocations(), false, GetVIXLAssembler());
+}
+void IntrinsicCodeGeneratorARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
+  MoveIntToFP(invoke->GetLocations(), false, GetVIXLAssembler());
+}
+
+static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+static void GenReverseBytes(LocationSummary* locations,
+                            Primitive::Type type,
+                            vixl::MacroAssembler* masm) {
+  Location in = locations->InAt(0);
+  Location out = locations->Out();
+
+  switch (type) {
+    case Primitive::kPrimShort:
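+      // Rev16 swaps the bytes within each halfword; Sxth sign-extends the 16-bit result.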
+      __ Rev16(WRegisterFrom(out), WRegisterFrom(in));
+      __ Sxth(WRegisterFrom(out), WRegisterFrom(out));
+      break;
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      __ Rev(RegisterFrom(out, type), RegisterFrom(in, type));
+      break;
+    default:
+      LOG(FATAL) << "Unexpected size for reverse-bytes: " << type;
+      UNREACHABLE();
+  }
+}
+
+void IntrinsicLocationsBuilderARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimInt, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitLongReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitLongReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimLong, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitShortReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitShortReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimShort, GetVIXLAssembler());
+}
+
+static void GenReverse(LocationSummary* locations,
+                       Primitive::Type type,
+                       vixl::MacroAssembler* masm) {
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+
+  Location in = locations->InAt(0);
+  Location out = locations->Out();
+
+  __ Rbit(RegisterFrom(out, type), RegisterFrom(in, type));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitIntegerReverse(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitIntegerReverse(HInvoke* invoke) {
+  GenReverse(invoke->GetLocations(), Primitive::kPrimInt, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitLongReverse(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitLongReverse(HInvoke* invoke) {
+  GenReverse(invoke->GetLocations(), Primitive::kPrimLong, GetVIXLAssembler());
+}
+
+static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  // We only support FP registers here.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+}
+
+static void MathAbsFP(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) {
+  Location in = locations->InAt(0);
+  Location out = locations->Out();
+
+  FPRegister in_reg = is64bit ? DRegisterFrom(in) : SRegisterFrom(in);
+  FPRegister out_reg = is64bit ? DRegisterFrom(out) : SRegisterFrom(out);
+
+  __ Fabs(out_reg, in_reg);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathAbsDouble(HInvoke* invoke) {
+  CreateFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathAbsDouble(HInvoke* invoke) {
+  MathAbsFP(invoke->GetLocations(), true, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathAbsFloat(HInvoke* invoke) {
+  CreateFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathAbsFloat(HInvoke* invoke) {
+  MathAbsFP(invoke->GetLocations(), false, GetVIXLAssembler());
+}
+
+static void CreateIntToInt(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+static void GenAbsInteger(LocationSummary* locations,
+                          bool is64bit,
+                          vixl::MacroAssembler* masm) {
+  Location in = locations->InAt(0);
+  Location output = locations->Out();
+
+  Register in_reg = is64bit ? XRegisterFrom(in) : WRegisterFrom(in);
+  Register out_reg = is64bit ? XRegisterFrom(output) : WRegisterFrom(output);
+
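+  // Conditional negate: out = (in < 0) ? -in : in.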
+  __ Cmp(in_reg, Operand(0));
+  __ Cneg(out_reg, in_reg, lt);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathAbsInt(HInvoke* invoke) {
+  CreateIntToInt(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathAbsInt(HInvoke* invoke) {
+  GenAbsInteger(invoke->GetLocations(), false, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathAbsLong(HInvoke* invoke) {
+  CreateIntToInt(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathAbsLong(HInvoke* invoke) {
+  GenAbsInteger(invoke->GetLocations(), true, GetVIXLAssembler());
+}
+
+static void GenMinMaxFP(LocationSummary* locations,
+                        bool is_min,
+                        bool is_double,
+                        vixl::MacroAssembler* masm) {
+  Location op1 = locations->InAt(0);
+  Location op2 = locations->InAt(1);
+  Location out = locations->Out();
+
+  FPRegister op1_reg = is_double ? DRegisterFrom(op1) : SRegisterFrom(op1);
+  FPRegister op2_reg = is_double ? DRegisterFrom(op2) : SRegisterFrom(op2);
+  FPRegister out_reg = is_double ? DRegisterFrom(out) : SRegisterFrom(out);
+  if (is_min) {
+    __ Fmin(out_reg, op1_reg, op2_reg);
+  } else {
+    __ Fmax(out_reg, op1_reg, op2_reg);
+  }
+}
+
+static void CreateFPFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresFpuRegister());
+  locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMinDoubleDouble(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMinDoubleDouble(HInvoke* invoke) {
+  GenMinMaxFP(invoke->GetLocations(), true, true, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMinFloatFloat(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMinFloatFloat(HInvoke* invoke) {
+  GenMinMaxFP(invoke->GetLocations(), true, false, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
+  GenMinMaxFP(invoke->GetLocations(), false, true, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMaxFloatFloat(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMaxFloatFloat(HInvoke* invoke) {
+  GenMinMaxFP(invoke->GetLocations(), false, false, GetVIXLAssembler());
+}
+
+static void GenMinMax(LocationSummary* locations,
+                      bool is_min,
+                      bool is_long,
+                      vixl::MacroAssembler* masm) {
+  Location op1 = locations->InAt(0);
+  Location op2 = locations->InAt(1);
+  Location out = locations->Out();
+
+  Register op1_reg = is_long ? XRegisterFrom(op1) : WRegisterFrom(op1);
+  Register op2_reg = is_long ? XRegisterFrom(op2) : WRegisterFrom(op2);
+  Register out_reg = is_long ? XRegisterFrom(out) : WRegisterFrom(out);
+
+  __ Cmp(op1_reg, op2_reg);
+  __ Csel(out_reg, op1_reg, op2_reg, is_min ? lt : gt);
+}
+
+static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMinIntInt(HInvoke* invoke) {
+  CreateIntIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMinIntInt(HInvoke* invoke) {
+  GenMinMax(invoke->GetLocations(), true, false, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMinLongLong(HInvoke* invoke) {
+  CreateIntIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMinLongLong(HInvoke* invoke) {
+  GenMinMax(invoke->GetLocations(), true, true, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMaxIntInt(HInvoke* invoke) {
+  CreateIntIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMaxIntInt(HInvoke* invoke) {
+  GenMinMax(invoke->GetLocations(), false, false, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathMaxLongLong(HInvoke* invoke) {
+  CreateIntIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathMaxLongLong(HInvoke* invoke) {
+  GenMinMax(invoke->GetLocations(), false, true, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathSqrt(HInvoke* invoke) {
+  CreateFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathSqrt(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Fsqrt(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathCeil(HInvoke* invoke) {
+  CreateFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathCeil(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Frintp(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathFloor(HInvoke* invoke) {
+  CreateFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathFloor(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Frintm(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathRint(HInvoke* invoke) {
+  CreateFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathRint(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Frintn(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
+}
+
+static void CreateFPToIntPlusTempLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetOut(Location::RequiresRegister());
+}
+
+static void GenMathRound(LocationSummary* locations,
+                         bool is_double,
+                         vixl::MacroAssembler* masm) {
+  FPRegister in_reg = is_double ?
+      DRegisterFrom(locations->InAt(0)) : SRegisterFrom(locations->InAt(0));
+  Register out_reg = is_double ?
+      XRegisterFrom(locations->Out()) : WRegisterFrom(locations->Out());
+  UseScratchRegisterScope temps(masm);
+  FPRegister temp1_reg = temps.AcquireSameSizeAs(in_reg);
+
+  // 0.5 can be encoded as an immediate, so use fmov.
+  if (is_double) {
+    __ Fmov(temp1_reg, static_cast<double>(0.5));
+  } else {
+    __ Fmov(temp1_reg, static_cast<float>(0.5));
+  }
+  __ Fadd(temp1_reg, in_reg, temp1_reg);
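+  // Fcvtms rounds toward minus infinity, so floor(in + 0.5) gives Math.round's half-up result.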
+  __ Fcvtms(out_reg, temp1_reg);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathRoundDouble(HInvoke* invoke) {
+  CreateFPToIntPlusTempLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathRoundDouble(HInvoke* invoke) {
+  GenMathRound(invoke->GetLocations(), true, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathRoundFloat(HInvoke* invoke) {
+  CreateFPToIntPlusTempLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathRoundFloat(HInvoke* invoke) {
+  GenMathRound(invoke->GetLocations(), false, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPeekByte(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPeekByte(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Ldrsb(WRegisterFrom(invoke->GetLocations()->Out()),
+           AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Ldr(WRegisterFrom(invoke->GetLocations()->Out()),
+         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Ldr(XRegisterFrom(invoke->GetLocations()->Out()),
+         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Ldrsh(WRegisterFrom(invoke->GetLocations()->Out()),
+           AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+static void CreateIntIntToVoidLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPokeByte(HInvoke* invoke) {
+  CreateIntIntToVoidLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPokeByte(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Strb(WRegisterFrom(invoke->GetLocations()->InAt(1)),
+          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
+  CreateIntIntToVoidLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Str(WRegisterFrom(invoke->GetLocations()->InAt(1)),
+         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
+  CreateIntIntToVoidLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Str(XRegisterFrom(invoke->GetLocations()->InAt(1)),
+         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
+  CreateIntIntToVoidLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  __ Strh(WRegisterFrom(invoke->GetLocations()->InAt(1)),
+          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
+}
+
+void IntrinsicLocationsBuilderARM64::VisitThreadCurrentThread(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetOut(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitThreadCurrentThread(HInvoke* invoke) {
+  codegen_->Load(Primitive::kPrimNot, WRegisterFrom(invoke->GetLocations()->Out()),
+                 MemOperand(tr, Thread::PeerOffset<8>().Int32Value()));
+}
+
+static void GenUnsafeGet(HInvoke* invoke,
+                         Primitive::Type type,
+                         bool is_volatile,
+                         CodeGeneratorARM64* codegen) {
+  LocationSummary* locations = invoke->GetLocations();
+  DCHECK((type == Primitive::kPrimInt) ||
+         (type == Primitive::kPrimLong) ||
+         (type == Primitive::kPrimNot));
+  vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
+  Register base = WRegisterFrom(locations->InAt(1));    // Object pointer.
+  Register offset = XRegisterFrom(locations->InAt(2));  // Long offset.
+  Register trg = RegisterFrom(locations->Out(), type);
+
+  MemOperand mem_op(base.X(), offset);
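+  // Volatile gets use a load-acquire when kUseAcquireRelease is set, otherwise a plain
+  // load followed by a dmb.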
+  if (is_volatile) {
+    if (kUseAcquireRelease) {
+      codegen->LoadAcquire(invoke, trg, mem_op);
+    } else {
+      codegen->Load(type, trg, mem_op);
+      __ Dmb(InnerShareable, BarrierReads);
+    }
+  } else {
+    codegen->Load(type, trg, mem_op);
+  }
+}
+
+static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetInAt(2, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitUnsafeGet(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLong(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObject(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitUnsafeGet(HInvoke* invoke) {
+  GenUnsafeGet(invoke, Primitive::kPrimInt, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
+  GenUnsafeGet(invoke, Primitive::kPrimInt, true, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLong(HInvoke* invoke) {
+  GenUnsafeGet(invoke, Primitive::kPrimLong, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
+  GenUnsafeGet(invoke, Primitive::kPrimLong, true, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObject(HInvoke* invoke) {
+  GenUnsafeGet(invoke, Primitive::kPrimNot, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
+  GenUnsafeGet(invoke, Primitive::kPrimNot, true, codegen_);
+}
+
+static void CreateIntIntIntIntToVoid(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetInAt(2, Location::RequiresRegister());
+  locations->SetInAt(3, Location::RequiresRegister());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitUnsafePut(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutObject(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutLong(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
+  CreateIntIntIntIntToVoid(arena_, invoke);
+}
+
+static void GenUnsafePut(LocationSummary* locations,
+                         Primitive::Type type,
+                         bool is_volatile,
+                         bool is_ordered,
+                         CodeGeneratorARM64* codegen) {
+  vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
+
+  Register base = WRegisterFrom(locations->InAt(1));    // Object pointer.
+  Register offset = XRegisterFrom(locations->InAt(2));  // Long offset.
+  Register value = RegisterFrom(locations->InAt(3), type);
+
+  MemOperand mem_op(base.X(), offset);
+
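+  // Volatile and ordered puts prefer a store-release; the fallback issues a full dmb before the
+  // store and, for volatile puts, another barrier after it.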
+  if (is_volatile || is_ordered) {
+    if (kUseAcquireRelease) {
+      codegen->StoreRelease(type, value, mem_op);
+    } else {
+      __ Dmb(InnerShareable, BarrierAll);
+      codegen->Store(type, value, mem_op);
+      if (is_volatile) {
+        __ Dmb(InnerShareable, BarrierReads);
+      }
+    }
+  } else {
+    codegen->Store(type, value, mem_op);
+  }
+
+  if (type == Primitive::kPrimNot) {
+    codegen->MarkGCCard(base, value);
+  }
+}
+
+void IntrinsicCodeGeneratorARM64::VisitUnsafePut(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, false, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, false, true, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, true, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutObject(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, false, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, false, true, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, true, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutLong(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, false, false, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, false, true, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
+  GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, true, false, codegen_);
+}
+
+static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetInAt(2, Location::RequiresRegister());
+  locations->SetInAt(3, Location::RequiresRegister());
+  locations->SetInAt(4, Location::RequiresRegister());
+
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorARM64* codegen) {
+  // TODO: Currently we use acquire-release load-stores in the CAS loop. One could reasonably write
+  //       a version relying on simple exclusive load-stores and barriers instead.
+  static_assert(kUseAcquireRelease, "Non-acquire-release inlined CAS not implemented, yet.");
+
+  vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
+
+  Register out = WRegisterFrom(locations->Out());                  // Boolean result.
+
+  Register base = WRegisterFrom(locations->InAt(1));               // Object pointer.
+  Register offset = XRegisterFrom(locations->InAt(2));             // Long offset.
+  Register expected = RegisterFrom(locations->InAt(3), type);      // Expected.
+  Register value = RegisterFrom(locations->InAt(4), type);         // Value.
+
+  // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
+  if (type == Primitive::kPrimNot) {
+    // Mark card for object assuming new value is stored.
+    codegen->MarkGCCard(base, value);
+  }
+
+  UseScratchRegisterScope temps(masm);
+  Register tmp_ptr = temps.AcquireX();                             // Pointer to actual memory.
+  Register tmp_value = temps.AcquireSameSizeAs(value);             // Value in memory.
+
+  Register tmp_32 = tmp_value.W();
+
+  __ Add(tmp_ptr, base.X(), Operand(offset));
+
+  // do {
+  //   tmp_value = [tmp_ptr];
+  // } while (tmp_value == expected && failure([tmp_ptr] <- value));
+  // result = (tmp_value == expected);
+
+  vixl::Label loop_head, exit_loop;
+  __ Bind(&loop_head);
+
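+  // Load-exclusive with acquire semantics, then compare against the expected value and exit the
+  // loop on mismatch.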
+  __ Ldaxr(tmp_value, MemOperand(tmp_ptr));
+  __ Cmp(tmp_value, expected);
+  __ B(&exit_loop, ne);
+
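+  // Store-exclusive with release semantics; a non-zero status in tmp_32 means the exclusive
+  // monitor was lost, so retry.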
+  __ Stlxr(tmp_32, value, MemOperand(tmp_ptr));
+  __ Cbnz(tmp_32, &loop_head);
+
+  __ Bind(&exit_loop);
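+  // The flags still hold the result of the last comparison: eq iff the expected value was found
+  // (and, on the fall-through path, the store succeeded).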
+  __ Cset(out, eq);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitUnsafeCASInt(HInvoke* invoke) {
+  CreateIntIntIntIntIntToInt(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafeCASLong(HInvoke* invoke) {
+  CreateIntIntIntIntIntToInt(arena_, invoke);
+}
+void IntrinsicLocationsBuilderARM64::VisitUnsafeCASObject(HInvoke* invoke) {
+  CreateIntIntIntIntIntToInt(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitUnsafeCASInt(HInvoke* invoke) {
+  GenCas(invoke->GetLocations(), Primitive::kPrimInt, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafeCASLong(HInvoke* invoke) {
+  GenCas(invoke->GetLocations(), Primitive::kPrimLong, codegen_);
+}
+void IntrinsicCodeGeneratorARM64::VisitUnsafeCASObject(HInvoke* invoke) {
+  GenCas(invoke->GetLocations(), Primitive::kPrimNot, codegen_);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitStringCharAt(HInvoke* invoke) {
+  // The inputs and the output; temps are taken from VIXL's scratch pool in the code generator.
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kCallOnSlowPath,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitStringCharAt(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  // Location of reference to data array
+  const MemberOffset value_offset = mirror::String::ValueOffset();
+  // Location of count
+  const MemberOffset count_offset = mirror::String::CountOffset();
+  // Starting offset within data array
+  const MemberOffset offset_offset = mirror::String::OffsetOffset();
+  // Start of char data within the array.
+  const MemberOffset data_offset = mirror::Array::DataOffset(sizeof(uint16_t));
+
+  Register obj = WRegisterFrom(locations->InAt(0));  // String object pointer.
+  Register idx = WRegisterFrom(locations->InAt(1));  // Index of character.
+  Register out = WRegisterFrom(locations->Out());    // Result character.
+
+  UseScratchRegisterScope temps(masm);
+  Register temp = temps.AcquireW();
+  Register array_temp = temps.AcquireW();            // We can trade this for worse scheduling.
+
+  // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth
+  //       the cost.
+  // TODO: For simplicity, the index parameter is requested in a register, so different from Quick
+  //       we will not optimize the code for constants (which would save a register).
+
+  SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
+  codegen_->AddSlowPath(slow_path);
+
+  __ Ldr(temp, HeapOperand(obj, count_offset));          // temp = str.length.
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
+  __ Cmp(idx, temp);
+  __ B(hs, slow_path->GetEntryLabel());
+
+  // Index computation.
+  __ Ldr(temp, HeapOperand(obj, offset_offset));         // temp := str.offset.
+  __ Ldr(array_temp, HeapOperand(obj, value_offset));    // array_temp := str.value.
+  __ Add(temp, temp, idx);
+  DCHECK_EQ(data_offset.Int32Value() % 2, 0);            // We'll compensate by shifting.
+  __ Add(temp, temp, Operand(data_offset.Int32Value() / 2));
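+  // temp is now the char index into the backing array (str.offset + idx + data_offset in chars);
+  // the UXTW #1 on the load below scales it back to a byte offset.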
+
+  // Load the value.
+  __ Ldrh(out, MemOperand(array_temp.X(), temp, UXTW, 1));  // out := array_temp[temp].
+
+  __ Bind(slow_path->GetExitLabel());
+}
+
+// Unimplemented intrinsics.
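+// The empty visitor bodies generated below leave the invoke un-intrinsified, so these calls fall
+// back to the regular invoke code path.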
+
+#define UNIMPLEMENTED_INTRINSIC(Name)                                                  \
+void IntrinsicLocationsBuilderARM64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \
+}                                                                                      \
+void IntrinsicCodeGeneratorARM64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) {    \
+}
+
+UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
+UNIMPLEMENTED_INTRINSIC(StringCompareTo)
+UNIMPLEMENTED_INTRINSIC(StringIsEmpty)  // Might not want to implement these two anyway; inlining
+UNIMPLEMENTED_INTRINSIC(StringLength)   // should be good enough here.
+UNIMPLEMENTED_INTRINSIC(StringIndexOf)
+UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
+UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
+
+}  // namespace arm64
+}  // namespace art
diff --git a/compiler/optimizing/intrinsics_arm64.h b/compiler/optimizing/intrinsics_arm64.h
new file mode 100644
index 0000000..ba21889
--- /dev/null
+++ b/compiler/optimizing/intrinsics_arm64.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_INTRINSICS_ARM64_H_
+#define ART_COMPILER_OPTIMIZING_INTRINSICS_ARM64_H_
+
+#include "intrinsics.h"
+
+namespace vixl {
+
+class MacroAssembler;
+
+}  // namespace vixl
+
+namespace art {
+
+class ArenaAllocator;
+class HInvokeStaticOrDirect;
+class HInvokeVirtual;
+
+namespace arm64 {
+
+class CodeGeneratorARM64;
+
+class IntrinsicLocationsBuilderARM64 FINAL : public IntrinsicVisitor {
+ public:
+  explicit IntrinsicLocationsBuilderARM64(ArenaAllocator* arena) : arena_(arena) {}
+
+  // Define visitor methods.
+
+#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+  void Visit ## Name(HInvoke* invoke) OVERRIDE;
+#include "intrinsics_list.h"
+INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
+#undef INTRINSICS_LIST
+#undef OPTIMIZING_INTRINSICS
+
+  // Check whether an invoke is an intrinsic, and if so, create a location summary. Returns whether
+  // a corresponding LocationSummary with the intrinsified_ flag set was generated and attached to
+  // the invoke.
+  bool TryDispatch(HInvoke* invoke);
+
+ private:
+  ArenaAllocator* arena_;
+
+  DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderARM64);
+};
+
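+// Generates the actual code for invokes whose LocationSummary was marked as intrinsified by the
+// locations builder above.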
+class IntrinsicCodeGeneratorARM64 FINAL : public IntrinsicVisitor {
+ public:
+  explicit IntrinsicCodeGeneratorARM64(CodeGeneratorARM64* codegen) : codegen_(codegen) {}
+
+  // Define visitor methods.
+
+#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+  void Visit ## Name(HInvoke* invoke) OVERRIDE;
+#include "intrinsics_list.h"
+INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
+#undef INTRINSICS_LIST
+#undef OPTIMIZING_INTRINSICS
+
+ private:
+  vixl::MacroAssembler* GetVIXLAssembler();
+
+  ArenaAllocator* GetAllocator();
+
+  CodeGeneratorARM64* codegen_;
+
+  DISALLOW_COPY_AND_ASSIGN(IntrinsicCodeGeneratorARM64);
+};
+
+}  // namespace arm64
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_INTRINSICS_ARM64_H_
diff --git a/compiler/optimizing/intrinsics_list.h b/compiler/optimizing/intrinsics_list.h
index 29ca20c..9cc77c6 100644
--- a/compiler/optimizing/intrinsics_list.h
+++ b/compiler/optimizing/intrinsics_list.h
@@ -69,6 +69,8 @@
   V(UnsafeCASObject, kDirect) \
   V(UnsafeGet, kDirect) \
   V(UnsafeGetVolatile, kDirect) \
+  V(UnsafeGetObject, kDirect) \
+  V(UnsafeGetObjectVolatile, kDirect) \
   V(UnsafeGetLong, kDirect) \
   V(UnsafeGetLongVolatile, kDirect) \
   V(UnsafePut, kDirect) \
@@ -80,8 +82,7 @@
   V(UnsafePutLong, kDirect) \
   V(UnsafePutLongOrdered, kDirect) \
   V(UnsafePutLongVolatile, kDirect) \
-  \
-  V(ReferenceGetReferent, kVirtual)
+  V(ReferenceGetReferent, kDirect)
 
 #endif  // ART_COMPILER_OPTIMIZING_INTRINSICS_LIST_H_
 #undef ART_COMPILER_OPTIMIZING_INTRINSICS_LIST_H_   // #define is only for lint.
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 2c23945..c73f092 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -30,13 +30,11 @@
 
 namespace x86_64 {
 
-static constexpr bool kIntrinsified = true;
-
 X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
   return reinterpret_cast<X86_64Assembler*>(codegen_->GetAssembler());
 }
 
-ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetArena() {
+ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
   return codegen_->GetGraph()->GetArena();
 }
 
@@ -644,21 +642,18 @@
   Location temp_loc = locations->GetTemp(0);
   CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
 
-  // Note: Nullcheck has been done before in a HNullCheck before the HInvokeVirtual. If/when we
-  //       move to (coalesced) implicit checks, we have to do a null check below.
-  DCHECK(!kCoalescedImplicitNullCheck);
-
   // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth
   //       the cost.
   // TODO: For simplicity, the index parameter is requested in a register, so different from Quick
   //       we will not optimize the code for constants (which would save a register).
 
-  SlowPathCodeX86_64* slow_path = new (GetArena()) IntrinsicSlowPathX86_64(invoke);
+  SlowPathCodeX86_64* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
   codegen_->AddSlowPath(slow_path);
 
   X86_64Assembler* assembler = GetAssembler();
 
   __ cmpl(idx, Address(obj, count_offset));
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
   __ j(kAboveEqual, slow_path->GetEntryLabel());
 
   // Get the actual element.
@@ -803,18 +798,25 @@
   GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64WordSize>(), true));
 }
 
-static void GenUnsafeGet(LocationSummary* locations, bool is_long,
+static void GenUnsafeGet(LocationSummary* locations, Primitive::Type type,
                          bool is_volatile ATTRIBUTE_UNUSED, X86_64Assembler* assembler) {
   CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
   CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
   CpuRegister trg = locations->Out().AsRegister<CpuRegister>();
 
-  if (is_long) {
-    __ movq(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
-  } else {
-    // TODO: Distinguish object. In case we move to an actual compressed heap, retrieving an object
-    //       pointer will entail an unpack operation.
-    __ movl(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
+  switch (type) {
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      __ movl(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
+      break;
+
+    case Primitive::kPrimLong:
+      __ movq(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
+      break;
+
+    default:
+      LOG(FATAL) << "Unsupported op size " << type;
+      UNREACHABLE();
   }
 }
 
@@ -822,10 +824,10 @@
   LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
-  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
-  locations->SetOut(Location::SameAsFirstInput());
+  locations->SetOut(Location::RequiresRegister());
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
@@ -840,19 +842,33 @@
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   CreateIntIntIntToIntLocations(arena_, invoke);
 }
+void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke);
+}
+
 
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), false, false, GetAssembler());
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, false, GetAssembler());
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), false, true, GetAssembler());
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, true, GetAssembler());
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), true, false, GetAssembler());
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, false, GetAssembler());
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), true, true, GetAssembler());
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, true, GetAssembler());
 }
+void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, false, GetAssembler());
+}
+void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, true, GetAssembler());
+}
+
 
 static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* arena,
                                                        Primitive::Type type,
@@ -860,7 +876,7 @@
   LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
-  locations->SetInAt(0, Location::NoLocation());
+  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
   locations->SetInAt(3, Location::RequiresRegister());
diff --git a/compiler/optimizing/intrinsics_x86_64.h b/compiler/optimizing/intrinsics_x86_64.h
index c1fa99c..dfae7fa 100644
--- a/compiler/optimizing/intrinsics_x86_64.h
+++ b/compiler/optimizing/intrinsics_x86_64.h
@@ -70,7 +70,7 @@
  private:
   X86_64Assembler* GetAssembler();
 
-  ArenaAllocator* GetArena();
+  ArenaAllocator* GetAllocator();
 
   CodeGeneratorX86_64* codegen_;
 
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 6bf8f77..8b06d60 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -446,6 +446,8 @@
   DISALLOW_COPY_AND_ASSIGN(RegisterSet);
 };
 
+static constexpr bool kIntrinsified = true;
+
 /**
  * The code generator computes LocationSummary for each instruction so that
  * the instruction itself knows what code to generate: where to find the inputs