Merge "Quick compiler: fix compile-time perf regression"
diff --git a/build/Android.common.mk b/build/Android.common.mk
index 6135571..188ddb5 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -110,6 +110,12 @@
 DALVIKVM_FLAGS := -Xcompiler-option --compiler-backend=Optimizing
 endif
 
+#
+# Used to change the default GC. Valid values are CMS, SS, GSS. The default is CMS.
+#
+ART_DEFAULT_GC_TYPE ?= CMS
+ART_DEFAULT_GC_TYPE_CFLAGS := -DART_DEFAULT_GC_TYPE_IS_$(ART_DEFAULT_GC_TYPE)
+
 LLVM_ROOT_PATH := external/llvm
 # Don't fail a dalvik minimal host build.
 -include $(LLVM_ROOT_PATH)/llvm.mk
@@ -237,6 +243,7 @@
 
 ART_HOST_CFLAGS := $(art_cflags) -DANDROID_SMP=1 -DART_BASE_ADDRESS=$(LIBART_IMG_HOST_BASE_ADDRESS)
 ART_HOST_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES=default
+ART_HOST_CFLAGS += $(ART_DEFAULT_GC_TYPE_CFLAGS)
 
 ART_TARGET_CFLAGS := $(art_cflags) -DART_TARGET -DART_BASE_ADDRESS=$(LIBART_IMG_TARGET_BASE_ADDRESS)
 ifeq ($(TARGET_CPU_SMP),true)
@@ -244,6 +251,7 @@
 else
   ART_TARGET_CFLAGS += -DANDROID_SMP=0
 endif
+ART_TARGET_CFLAGS += $(ART_DEFAULT_GC_TYPE_CFLAGS)
 
 # DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES is set in ../build/core/dex_preopt.mk based on
 # the TARGET_CPU_VARIANT
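
The new make variable only adds a -D flag to the C++ compile: with the default ART_DEFAULT_GC_TYPE ?= CMS, every ART translation unit is built with -DART_DEFAULT_GC_TYPE_IS_CMS. A minimal sketch of the compile-time selection this enables (the real consumer is runtime/parsed_options.cc, later in this change):

    // Sketch only: exactly one ART_DEFAULT_GC_TYPE_IS_<TYPE> symbol is defined (to 1) by the build.
    #if ART_DEFAULT_GC_TYPE_IS_CMS
    // the compiled-in default collector is concurrent mark-sweep
    #elif ART_DEFAULT_GC_TYPE_IS_SS
    // the compiled-in default collector is semispace
    #elif ART_DEFAULT_GC_TYPE_IS_GSS
    // the compiled-in default collector is generational semispace
    #else
    #error "ART default GC type must be set"
    #endif
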
diff --git a/compiler/dex/quick/gen_loadstore.cc b/compiler/dex/quick/gen_loadstore.cc
index faa9461..8fcb09b 100644
--- a/compiler/dex/quick/gen_loadstore.cc
+++ b/compiler/dex/quick/gen_loadstore.cc
@@ -139,12 +139,25 @@
 }
 
 RegLocation Mir2Lir::LoadValue(RegLocation rl_src, RegisterClass op_kind) {
-  rl_src = EvalLoc(rl_src, op_kind, false);
-  if (IsInexpensiveConstant(rl_src) || rl_src.location != kLocPhysReg) {
-    LoadValueDirect(rl_src, rl_src.reg);
-    rl_src.location = kLocPhysReg;
-    MarkLive(rl_src);
+  rl_src = UpdateLoc(rl_src);
+  if (rl_src.location == kLocPhysReg) {
+    if (!RegClassMatches(op_kind, rl_src.reg)) {
+      // Wrong register class, realloc, copy and transfer ownership.
+      RegStorage new_reg = AllocTypedTemp(rl_src.fp, op_kind);
+      OpRegCopy(new_reg, rl_src.reg);
+      // Associate the old sreg with the new register and clobber the old register.
+      GetRegInfo(new_reg)->SetSReg(GetRegInfo(rl_src.reg)->SReg());
+      Clobber(rl_src.reg);
+      rl_src.reg = new_reg;
+    }
+    return rl_src;
   }
+
+  DCHECK_NE(rl_src.s_reg_low, INVALID_SREG);
+  rl_src.reg = AllocTypedTemp(rl_src.fp, op_kind);
+  LoadValueDirect(rl_src, rl_src.reg);
+  rl_src.location = kLocPhysReg;
+  MarkLive(rl_src);
   return rl_src;
 }
 
@@ -203,12 +216,26 @@
 
 RegLocation Mir2Lir::LoadValueWide(RegLocation rl_src, RegisterClass op_kind) {
   DCHECK(rl_src.wide);
-  rl_src = EvalLoc(rl_src, op_kind, false);
-  if (IsInexpensiveConstant(rl_src) || rl_src.location != kLocPhysReg) {
-    LoadValueDirectWide(rl_src, rl_src.reg);
-    rl_src.location = kLocPhysReg;
-    MarkLive(rl_src);
+  rl_src = UpdateLocWide(rl_src);
+  if (rl_src.location == kLocPhysReg) {
+    if (!RegClassMatches(op_kind, rl_src.reg)) {
+      // Wrong register class, realloc, copy and transfer ownership.
+      RegStorage new_regs = AllocTypedTempWide(rl_src.fp, op_kind);
+      OpRegCopyWide(new_regs, rl_src.reg);
+      // Associate the old sreg with the new register and clobber the old register.
+      GetRegInfo(new_regs)->SetSReg(GetRegInfo(rl_src.reg)->SReg());
+      Clobber(rl_src.reg);
+      rl_src.reg = new_regs;
+    }
+    return rl_src;
   }
+
+  DCHECK_NE(rl_src.s_reg_low, INVALID_SREG);
+  DCHECK_NE(GetSRegHi(rl_src.s_reg_low), INVALID_SREG);
+  rl_src.reg = AllocTypedTempWide(rl_src.fp, op_kind);
+  LoadValueDirectWide(rl_src, rl_src.reg);
+  rl_src.location = kLocPhysReg;
+  MarkLive(rl_src);
   return rl_src;
 }
 
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index eebd554..3016cd1 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -681,9 +681,9 @@
     RegLocation UpdateRawLoc(RegLocation loc);
 
     /**
-     * @brief Used to load register location into a typed temporary or pair of temporaries.
+     * @brief Used to prepare a register location to receive a wide value.
      * @see EvalLoc
-     * @param loc The register location to load from.
+     * @param loc The location where the value will be stored.
      * @param reg_class Type of register needed.
      * @param update Whether the liveness information should be updated.
      * @return Returns the properly typed temporary in physical register pairs.
@@ -691,8 +691,8 @@
     RegLocation EvalLocWide(RegLocation loc, int reg_class, bool update);
 
     /**
-     * @brief Used to load register location into a typed temporary.
-     * @param loc The register location to load from.
+     * @brief Used to prepare a register location to receive a value.
+     * @param loc The location where the value will be stored.
      * @param reg_class Type of register needed.
      * @param update Whether the liveness information should be updated.
      * @return Returns the properly typed temporary in physical register.
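
The reworded comments make the division of labour explicit: LoadValue/LoadValueWide materialize a source value in a register of the requested class, while EvalLoc/EvalLocWide only prepare a destination and load nothing. A minimal sketch of the usual call pattern inside a Mir2Lir codegen routine (illustrative fragment assuming the standard helpers; not copied from the tree):

    rl_src = LoadValue(rl_src, kCoreReg);                       // source now live in a core register
    RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);   // destination prepared, nothing loaded
    OpRegReg(kOpNeg, rl_result.reg, rl_src.reg);                // operate on the physical registers
    StoreValue(rl_dest, rl_result);                             // schedule write-back of the result
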
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 5e6e73b..bcc077b 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -954,9 +954,8 @@
   /* If already in registers, we can assume proper form.  Right reg class? */
   if (loc.location == kLocPhysReg) {
     if (!RegClassMatches(reg_class, loc.reg)) {
-      /* Wrong register class.  Reallocate and copy */
+      // Wrong register class.  Reallocate and transfer ownership.
       RegStorage new_regs = AllocTypedTempWide(loc.fp, reg_class);
-      OpRegCopyWide(new_regs, loc.reg);
       // Associate the old sreg with the new register and clobber the old register.
       GetRegInfo(new_regs)->SetSReg(GetRegInfo(loc.reg)->SReg());
       Clobber(loc.reg);
@@ -988,9 +987,8 @@
 
   if (loc.location == kLocPhysReg) {
     if (!RegClassMatches(reg_class, loc.reg)) {
-      /* Wrong register class.  Realloc, copy and transfer ownership */
+      // Wrong register class.  Reallocate and transfer ownership.
       RegStorage new_reg = AllocTypedTemp(loc.fp, reg_class);
-      OpRegCopy(new_reg, loc.reg);
       // Associate the old sreg with the new register and clobber the old register.
       GetRegInfo(new_reg)->SetSReg(GetRegInfo(loc.reg)->SReg());
       Clobber(loc.reg);
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 698fce4..b6e0841 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -1196,7 +1196,7 @@
   if (rl_src.location == kLocPhysReg) {
     // Both operands are in registers.
     // But we must ensure that rl_src is in pair
-    rl_src = EvalLocWide(rl_src, kCoreReg, true);
+    rl_src = LoadValueWide(rl_src, kCoreReg);
     if (rl_dest.reg.GetLowReg() == rl_src.reg.GetHighReg()) {
       // The registers are the same, so we would clobber it before the use.
       RegStorage temp_reg = AllocTemp();
diff --git a/compiler/dex/reg_storage.h b/compiler/dex/reg_storage.h
index df5aa7b..979f516 100644
--- a/compiler/dex/reg_storage.h
+++ b/compiler/dex/reg_storage.h
@@ -22,14 +22,14 @@
 
 /*
  * 16-bit representation of the physical register container holding a Dalvik value.
- * The encoding allows up to 32 physical elements per storage class, and supports eight
+ * The encoding allows up to 64 physical elements per storage class, and supports eight
  * register container shapes.
  *
- * [V] [D] [HHHHH] [SSS] [F] [LLLLL]
+ * [V] [HHHHH] [SSS] [F] [LLLLLL]
  *
- * [LLLLL]
+ * [LLLLLL]
  *  Physical register number for the low or solo register.
- *    0..31
+ *    0..63
  *
  * [F]
  *  Describes type of the [LLLLL] register.
@@ -51,19 +51,13 @@
  *  Physical register number of the high register (valid only for register pair).
  *    0..31
  *
- * [D]
- *  Describes type of the [HHHHH] register (valid only for register pair).
- *    0: Core
- *    1: Floating point
- *
  * [V]
  *    0 -> Invalid
  *    1 -> Valid
  *
  * Note that in all non-invalid cases, we can determine if the storage is floating point
- * by testing bit 6.  Though a mismatch appears to be permitted by the format, the [F][D] values
- * from each half of a pair must match (this allows the high and low regs of a pair to be more
- * easily individually manipulated).
+ * by testing bit 7.  Note also that a register pair is effectively limited to a pair of
+ * physical register numbers in the 0..31 range.
  *
  * On some target architectures, the same underlying physical register container can be given
  * different views.  For example, Arm's 32-bit single-precision floating point registers
@@ -82,30 +76,30 @@
     kValidMask     = 0x8000,
     kValid         = 0x8000,
     kInvalid       = 0x0000,
-    kShapeMask     = 0x01c0,
-    k32BitSolo     = 0x0040,
-    k64BitSolo     = 0x0080,
-    k64BitPair     = 0x00c0,
-    k128BitSolo    = 0x0100,
-    k256BitSolo    = 0x0140,
-    k512BitSolo    = 0x0180,
-    k1024BitSolo   = 0x01c0,
-    k64BitMask     = 0x0180,
-    k64Bits        = 0x0080,
-    kShapeTypeMask = 0x01e0,
-    kFloatingPoint = 0x0020,
+    kShapeMask     = 0x0380,
+    k32BitSolo     = 0x0080,
+    k64BitSolo     = 0x0100,
+    k64BitPair     = 0x0180,
+    k128BitSolo    = 0x0200,
+    k256BitSolo    = 0x0280,
+    k512BitSolo    = 0x0300,
+    k1024BitSolo   = 0x0380,
+    k64BitMask     = 0x0300,
+    k64Bits        = 0x0100,
+    kShapeTypeMask = 0x03c0,
+    kFloatingPoint = 0x0040,
     kCoreRegister  = 0x0000,
   };
 
-  static const uint16_t kRegValMask  = 0x01ff;  // Num, type and shape.
-  static const uint16_t kRegTypeMask = 0x003f;  // Num and type.
-  static const uint16_t kRegNumMask  = 0x001f;  // Num only.
+  static const uint16_t kRegValMask  = 0x03ff;     // Num, type and shape.
+  static const uint16_t kRegTypeMask = 0x007f;     // Num and type.
+  static const uint16_t kRegNumMask  = 0x003f;     // Num only.
+  static const uint16_t kHighRegNumMask = 0x001f;  // 0..31 for the high reg of a pair.
   static const uint16_t kMaxRegs     = kRegValMask + 1;
-  // TODO: deprecate use of kInvalidRegVal and speed up GetReg().
-  static const uint16_t kInvalidRegVal = 0x01ff;
-  static const uint16_t kHighRegShift = 9;
-  static const uint16_t kShapeMaskShift = 6;
-  static const uint16_t kHighRegMask = (kRegTypeMask << kHighRegShift);
+  // TODO: deprecate use of kInvalidRegVal and speed up GetReg().  Rely on valid bit instead.
+  static const uint16_t kInvalidRegVal = 0x03ff;
+  static const uint16_t kHighRegShift = 10;
+  static const uint16_t kHighRegMask = (kHighRegNumMask << kHighRegShift);
 
   // Reg is [F][LLLLL], will override any existing shape and use rs_kind.
   RegStorage(RegStorageKind rs_kind, int reg) {
@@ -116,7 +110,9 @@
   RegStorage(RegStorageKind rs_kind, int low_reg, int high_reg) {
     DCHECK_EQ(rs_kind, k64BitPair);
     DCHECK_EQ(low_reg & kFloatingPoint, high_reg & kFloatingPoint);
-    reg_ = kValid | rs_kind | ((high_reg & kRegTypeMask) << kHighRegShift) | (low_reg & kRegTypeMask);
+    DCHECK_LE(high_reg & kRegNumMask, kHighRegNumMask) << "High reg must be in 0..31";
+    reg_ = kValid | rs_kind | ((high_reg & kHighRegNumMask) << kHighRegShift) |
+        (low_reg & kRegTypeMask);
   }
   constexpr explicit RegStorage(uint16_t val) : reg_(val) {}
   RegStorage() : reg_(kInvalid) {}
@@ -206,7 +202,7 @@
   // Retrieve the most significant register of a pair.
   int GetHighReg() const {
     DCHECK(IsPair());
-    return k32BitSolo | ((reg_ & kHighRegMask) >> kHighRegShift);
+    return k32BitSolo | ((reg_ & kHighRegMask) >> kHighRegShift) | (reg_ & kFloatingPoint);
   }
 
   // Create a stand-alone RegStorage from the high reg of a pair.
@@ -217,7 +213,7 @@
 
   void SetHighReg(int reg) {
     DCHECK(IsPair());
-    reg_ = (reg_ & ~kHighRegMask) | ((reg & kRegTypeMask) << kHighRegShift);
+    reg_ = (reg_ & ~kHighRegMask) | ((reg & kHighRegNumMask) << kHighRegShift);
   }
 
   // Return the register number of low or solo.
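
Under the widened layout the low register number gets six bits while the high register of a pair keeps five, which is what the new masks and the 0..31 note above express. A small sketch of the packing, using only the constants defined in this header (illustrative bit arithmetic, not an API from the tree):

    // Core 64-bit pair, low reg 2, high reg 3:
    // [V][HHHHH][SSS][F][LLLLLL] = bit 15 | bits 14..10 | bits 9..7 | bit 6 | bits 5..0.
    uint16_t reg = 0x8000 /* kValid */ | 0x0180 /* k64BitPair */ |
                   ((3 & 0x001f /* kHighRegNumMask */) << 10 /* kHighRegShift */) |
                   (2 & 0x007f /* kRegTypeMask */);            // low number plus its F bit
    // GetHighReg() rebuilds a 32-bit solo view from the 5-bit high number and the
    // pair's shared floating-point bit:
    uint16_t high = 0x0080 /* k32BitSolo */ |
                    ((reg & 0x7c00 /* kHighRegMask */) >> 10) |
                    (reg & 0x0040 /* kFloatingPoint */);
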
diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc
index fb909a8..d03b99f 100644
--- a/compiler/trampolines/trampoline_compiler.cc
+++ b/compiler/trampolines/trampoline_compiler.cc
@@ -62,18 +62,15 @@
 
   switch (abi) {
     case kInterpreterAbi:  // Thread* is first argument (X0) in interpreter ABI.
-      // FIXME IPx used by VIXL - this is unsafe.
       __ JumpTo(Arm64ManagedRegister::FromCoreRegister(X0), Offset(offset.Int32Value()),
           Arm64ManagedRegister::FromCoreRegister(IP1));
 
       break;
     case kJniAbi:  // Load via Thread* held in JNIEnv* in first argument (X0).
-
       __ LoadRawPtr(Arm64ManagedRegister::FromCoreRegister(IP1),
                       Arm64ManagedRegister::FromCoreRegister(X0),
                       Offset(JNIEnvExt::SelfOffset().Int32Value()));
 
-      // FIXME IPx used by VIXL - this is unsafe.
       __ JumpTo(Arm64ManagedRegister::FromCoreRegister(IP1), Offset(offset.Int32Value()),
                 Arm64ManagedRegister::FromCoreRegister(IP0));
 
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index b4bb979..f486b3c 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -50,11 +50,11 @@
 }
 
 void Arm64Assembler::GetCurrentThread(ManagedRegister tr) {
-  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(TR1));
+  ___ Mov(reg_x(tr.AsArm64().AsCoreRegister()), reg_x(ETR));
 }
 
 void Arm64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister /* scratch */) {
-  StoreToOffset(TR1, SP, offset.Int32Value());
+  StoreToOffset(ETR, SP, offset.Int32Value());
 }
 
 // See Arm64 PCS Section 5.2.2.1.
@@ -79,11 +79,13 @@
     // VIXL macro-assembler handles all variants.
     ___ Add(reg_x(rd), reg_x(rn), value);
   } else {
-    // ip1 = rd + value
-    // rd = cond ? ip1 : rn
-    CHECK_NE(rn, IP1);
-    ___ Add(reg_x(IP1), reg_x(rn), value);
-    ___ Csel(reg_x(rd), reg_x(IP1), reg_x(rd), COND_OP(cond));
+    // temp = rd + value
+    // rd = cond ? temp : rn
+    vixl::UseScratchRegisterScope temps(vixl_masm_);
+    temps.Exclude(reg_x(rd), reg_x(rn));
+    vixl::Register temp = temps.AcquireX();
+    ___ Add(temp, reg_x(rn), value);
+    ___ Csel(reg_x(rd), temp, reg_x(rd), COND_OP(cond));
   }
 }
 
@@ -162,7 +164,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadImmediate(scratch.AsCoreRegister(), imm);
-  StoreToOffset(scratch.AsCoreRegister(), TR1, offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), ETR, offs.Int32Value());
 }
 
 void Arm64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> tr_offs,
@@ -171,13 +173,14 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   AddConstant(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), ETR, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::StoreStackPointerToThread64(ThreadOffset<8> tr_offs) {
-  // Arm64 does not support: "str sp, [dest]" therefore we use IP1 as a temp reg.
-  ___ Mov(reg_x(IP1), reg_x(SP));
-  StoreToOffset(IP1, TR1, tr_offs.Int32Value());
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  vixl::Register temp = temps.AcquireX();
+  ___ Mov(temp, reg_x(SP));
+  ___ Str(temp, MEM_OP(reg_x(ETR), tr_offs.Int32Value()));
 }
 
 void Arm64Assembler::StoreSpanning(FrameOffset dest_off, ManagedRegister m_source,
@@ -195,12 +198,14 @@
   if ((cond == AL) || (cond == NV)) {
     ___ Mov(reg_x(dest), value);
   } else {
-    // ip1 = value
-    // rd = cond ? ip1 : rd
+    // temp = value
+    // rd = cond ? temp : rd
     if (value != 0) {
-      CHECK_NE(dest, IP1);
-      ___ Mov(reg_x(IP1), value);
-      ___ Csel(reg_x(dest), reg_x(IP1), reg_x(dest), COND_OP(cond));
+      vixl::UseScratchRegisterScope temps(vixl_masm_);
+      temps.Exclude(reg_x(dest));
+      vixl::Register temp = temps.AcquireX();
+      ___ Mov(temp, value);
+      ___ Csel(reg_x(dest), temp, reg_x(dest), COND_OP(cond));
     } else {
       ___ Csel(reg_x(dest), reg_x(XZR), reg_x(dest), COND_OP(cond));
     }
@@ -276,7 +281,7 @@
 }
 
 void Arm64Assembler::LoadFromThread64(ManagedRegister m_dst, ThreadOffset<8> src, size_t size) {
-  return Load(m_dst.AsArm64(), TR1, src.Int32Value(), size);
+  return Load(m_dst.AsArm64(), ETR, src.Int32Value(), size);
 }
 
 void Arm64Assembler::LoadRef(ManagedRegister m_dst, FrameOffset offs) {
@@ -298,13 +303,16 @@
   Arm64ManagedRegister dst = m_dst.AsArm64();
   Arm64ManagedRegister base = m_base.AsArm64();
   CHECK(dst.IsCoreRegister() && base.IsCoreRegister());
-  LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
+  // Remove dst and base from the temp list - the higher level API uses IP1, IP0.
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  temps.Exclude(reg_x(dst.AsCoreRegister()), reg_x(base.AsCoreRegister()));
+  ___ Ldr(reg_x(dst.AsCoreRegister()), MEM_OP(reg_x(base.AsCoreRegister()), offs.Int32Value()));
 }
 
 void Arm64Assembler::LoadRawPtrFromThread64(ManagedRegister m_dst, ThreadOffset<8> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
-  LoadFromOffset(dst.AsCoreRegister(), TR1, offs.Int32Value());
+  LoadFromOffset(dst.AsCoreRegister(), ETR, offs.Int32Value());
 }
 
 // Copying routines.
@@ -342,7 +350,7 @@
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), ETR, tr_offs.Int32Value());
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
@@ -352,7 +360,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
   LoadFromOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
-  StoreToOffset(scratch.AsCoreRegister(), TR1, tr_offs.Int32Value());
+  StoreToOffset(scratch.AsCoreRegister(), ETR, tr_offs.Int32Value());
 }
 
 void Arm64Assembler::CopyRef(FrameOffset dest, FrameOffset src,
@@ -511,7 +519,10 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(base.IsCoreRegister()) << base;
   CHECK(scratch.IsCoreRegister()) << scratch;
-  LoadFromOffset(scratch.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
+  // Remove base and scratch from the temp list - the higher level API uses IP1, IP0.
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  temps.Exclude(reg_x(base.AsCoreRegister()), reg_x(scratch.AsCoreRegister()));
+  ___ Ldr(reg_x(scratch.AsCoreRegister()), MEM_OP(reg_x(base.AsCoreRegister()), offs.Int32Value()));
   ___ Br(reg_x(scratch.AsCoreRegister()));
 }
 
@@ -595,13 +606,17 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR1, Thread::ExceptionOffset<8>().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), ETR, Thread::ExceptionOffset<8>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
 
 void Arm64Assembler::EmitExceptionPoll(Arm64Exception *exception) {
-    // Bind exception poll entry.
+  vixl::UseScratchRegisterScope temps(vixl_masm_);
+  temps.Exclude(reg_x(exception->scratch_.AsCoreRegister()));
+  vixl::Register temp = temps.AcquireX();
+
+  // Bind exception poll entry.
   ___ Bind(exception->Entry());
   if (exception->stack_adjust_ != 0) {  // Fix up the frame.
     DecreaseFrameSize(exception->stack_adjust_);
@@ -609,12 +624,14 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR1, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
+  ___ Ldr(temp, MEM_OP(reg_x(ETR), QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value()));
 
-  // FIXME: Temporary fix for TR (XSELF).
-  ___ Mov(reg_x(TR), reg_x(TR1));
+  // Move ETR (callee-saved) back to TR (caller-saved). We use ETR on calls
+  // to external functions that might trash TR. We do not need the original
+  // X19 saved in BuildFrame().
+  ___ Mov(reg_x(TR), reg_x(ETR));
 
-  ___ Blr(reg_x(IP1));
+  ___ Blr(temp);
   // Call should never return.
   ___ Brk();
 }
@@ -634,8 +651,10 @@
   CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
   ___ PushCalleeSavedRegisters();
 
-  // FIXME: Temporary fix for TR (XSELF).
-  ___ Mov(reg_x(TR1), reg_x(TR));
+  // Move TR (caller-saved) to ETR (callee-saved). The original X19 has been
+  // saved by PushCalleeSavedRegisters(). This way we make sure that TR is not
+  // trashed by native code.
+  ___ Mov(reg_x(ETR), reg_x(TR));
 
   // Increase frame to required size - must be at least space to push Method*.
   CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
@@ -681,8 +700,10 @@
   size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
   DecreaseFrameSize(adjust);
 
-  // FIXME: Temporary fix for TR (XSELF).
-  ___ Mov(reg_x(TR), reg_x(TR1));
+  // Move ETR (callee-saved) back to TR (caller-saved), which might have been
+  // trashed by the native call. The original X19 (ETR) is restored as part of
+  // PopCalleeSavedRegisters().
+  ___ Mov(reg_x(TR), reg_x(ETR));
 
   // Pop callee saved and return to LR.
   ___ PopCalleeSavedRegisters();
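
The recurring change in this file replaces the hard-coded IP1 temporary with a register borrowed from VIXL's scratch pool, so the assembler no longer clashes with registers VIXL itself may use. A condensed sketch of the pattern as used above (names taken from the diff; the surrounding routine is illustrative):

    vixl::UseScratchRegisterScope temps(vixl_masm_);
    temps.Exclude(reg_x(rd), reg_x(rn));      // keep the caller's live registers out of the pool
    vixl::Register temp = temps.AcquireX();   // borrow an X register (IP0/IP1 by default)
    ___ Add(temp, reg_x(rn), value);
    ___ Csel(reg_x(rd), temp, reg_x(rd), COND_OP(cond));
    // temp is handed back automatically when `temps` goes out of scope.
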
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 97fb93a..583150c 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -85,6 +85,7 @@
   vixl_masm_(new vixl::MacroAssembler(vixl_buf_, kBufferSizeArm64)) {}
 
   virtual ~Arm64Assembler() {
+    delete vixl_masm_;
     delete[] vixl_buf_;
   }
 
@@ -237,8 +238,8 @@
   // Vixl buffer.
   byte* vixl_buf_;
 
-  // Unique ptr - vixl assembler.
-  UniquePtr<vixl::MacroAssembler> vixl_masm_;
+  // Vixl assembler.
+  vixl::MacroAssembler* vixl_masm_;
 
   // List of exception blocks to generate at the end of the code cache.
   std::vector<Arm64Exception*> exception_blocks_;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 8079460..f7cb254 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -443,35 +443,32 @@
     DELIVER_PENDING_EXCEPTION
 .endm
 
-// FIXME: Temporary fix for TR(XSELF).
 .macro NO_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
-    mov x0, x19                        // pass Thread::Current
+    mov x0, xSELF                        // pass Thread::Current
     mov x1, sp                        // pass SP
     b   \cxx_name                     // \cxx_name(Thread*, SP)
 END \c_name
 .endm
 
-// FIXME: Temporary fix for TR(XSELF).
 .macro ONE_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context.
-    mov x1, x19                       // pass Thread::Current.
+    mov x1, xSELF                       // pass Thread::Current.
     mov x2, sp                        // pass SP.
     b   \cxx_name                     // \cxx_name(arg, Thread*, SP).
     brk 0
 END \c_name
 .endm
 
-// FIXME: Temporary fix for TR(XSELF).
 .macro TWO_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
-    mov x2, x19                       // pass Thread::Current
+    mov x2, xSELF                       // pass Thread::Current
     mov x3, sp                        // pass SP
     b   \cxx_name                     // \cxx_name(arg1, arg2, Thread*, SP)
     brk 0
@@ -991,7 +988,6 @@
      * failure.
      */
     .extern artHandleFillArrayDataFromCode
-// TODO: xSELF -> x19.
 ENTRY art_quick_handle_fill_data
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // Save callee saves in case exception allocation triggers GC.
     mov    x2, xSELF                       // Pass Thread::Current.
@@ -1166,12 +1162,7 @@
     brk 0                         // Unreached.
 END art_quick_aput_obj
 
-UNIMPLEMENTED art_quick_initialize_static_storage
-UNIMPLEMENTED art_quick_initialize_type
-UNIMPLEMENTED art_quick_initialize_type_and_verify_access
-
 // Macro to facilitate adding new allocation entrypoints.
-// TODO: xSELF -> x19. Temporarily rely on xSELF being saved in REF_ONLY
 .macro TWO_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
@@ -1186,7 +1177,6 @@
 .endm
 
 // Macro to facilitate adding new array allocation entrypoints.
-// TODO: xSELF -> x19. Temporarily rely on xSELF being saved in REF_ONLY
 .macro THREE_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
@@ -1244,6 +1234,16 @@
 END \name
 .endm
 
+    /*
+     * Entry from managed code when static storage is uninitialized; this stub will run the
+     * class initializer and deliver an exception on error. On success the static storage base
+     * is returned.
+     */
+TWO_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO
+
+UNIMPLEMENTED art_quick_initialize_type
+UNIMPLEMENTED art_quick_initialize_type_and_verify_access
+
 ONE_ARG_REF_DOWNCALL art_quick_get32_static, artGet32StaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 ONE_ARG_REF_DOWNCALL art_quick_get64_static, artGet64StaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 ONE_ARG_REF_DOWNCALL art_quick_get_obj_static, artGetObjStaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
@@ -1273,8 +1273,13 @@
     RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_set64_static
 
-
-UNIMPLEMENTED art_quick_resolve_string
+    /*
+     * Entry from managed code to resolve a string; this stub will allocate a String and deliver
+     * an exception on error. On success the String is returned. x0 holds the referring method,
+     * w1 holds the string index. The fast-path check for a hit in the strings cache has already
+     * been performed.
+     */
+TWO_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALL_ALLOC_ENTRYPOINTS
@@ -1293,7 +1298,7 @@
     mov     x2, xSELF                   // pass Thread::Current
     mov     x3, sp                      // pass SP
     bl      artQuickProxyInvokeHandler  // (Method* proxy method, receiver, Thread*, SP)
-    ldr  xSELF, [sp, #200]              // Restore self pointer.
+    ldr     xSELF, [sp, #200]           // Restore self pointer.
     ldr     x2, [xSELF, THREAD_EXCEPTION_OFFSET]
     cbnz    x2, .Lexception_in_proxy    // success if no exception is pending
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME_NO_D0 // keep d0
@@ -1308,14 +1313,13 @@
 
 ENTRY art_quick_resolution_trampoline
     SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME
-    mov x19, x0           // save the called method
     mov x2, xSELF
     mov x3, sp
     bl artQuickResolutionTrampoline  // (called, receiver, Thread*, SP)
-    mov x9, x0            // Remember returned code pointer in x9.
-    mov x0, x19           // Restore the method, before x19 is restored to on-call value
+    cbz x0, 1f
+    mov x9, x0              // Remember returned code pointer in x9.
+    ldr x0, [sp, #0]        // artQuickResolutionTrampoline puts the called method in *SP.
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
-    cbz x9, 1f
     br x9
 1:
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
diff --git a/runtime/arch/arm64/registers_arm64.h b/runtime/arch/arm64/registers_arm64.h
index 2503918..ea346e0 100644
--- a/runtime/arch/arm64/registers_arm64.h
+++ b/runtime/arch/arm64/registers_arm64.h
@@ -56,8 +56,8 @@
   X29 = 29,
   X30 = 30,
   X31 = 31,
-  TR  = 18,     // ART Thread Register - Needs to be one of the callee saved regs.
-  TR1 = 19,     // FIXME!
+  TR  = 18,     // ART Thread Register - Managed Runtime (Caller Saved Reg)
+  ETR = 19,     // ART Thread Register - External Calls  (Callee Saved Reg)
   IP0 = 16,     // Used as scratch by VIXL.
   IP1 = 17,     // Used as scratch by ART JNI Assembler.
   FP  = 29,
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 1d05540..86f52aa 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -51,6 +51,11 @@
     }
   }
 
+  // Helper function needed since TEST_F makes a new class.
+  Thread::tls_ptr_sized_values* GetTlsPtr(Thread* self) {
+    return &self->tlsPtr_;
+  }
+
   size_t Invoke3(size_t arg0, size_t arg1, size_t arg2, uintptr_t code, Thread* self) {
     // Push a transition back into managed code onto the linked list in thread.
     ManagedStack fragment;
@@ -727,13 +732,6 @@
 #endif
 }
 
-
-#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
-extern "C" void art_quick_alloc_object_rosalloc(void);
-extern "C" void art_quick_alloc_object_resolved_rosalloc(void);
-extern "C" void art_quick_alloc_object_initialized_rosalloc(void);
-#endif
-
 TEST_F(StubTest, AllocObject) {
   TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
 
@@ -751,13 +749,12 @@
   // Play with it...
 
   EXPECT_FALSE(self->IsExceptionPending());
-
   {
     // Use an arbitrary method from c to use as referrer
     size_t result = Invoke3(static_cast<size_t>(c->GetDexTypeIndex()),    // type_idx
                             reinterpret_cast<size_t>(c->GetVirtualMethod(0)),  // arbitrary
                             0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObject),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -771,7 +768,7 @@
     // We can use nullptr in the second argument as we do not need a method here (not used in
     // resolved/initialized cases)
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_resolved_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObjectResolved),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -785,7 +782,7 @@
     // We can use nullptr in the second argument as we do not need a method here (not used in
     // resolved/initialized cases)
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_initialized_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObjectInitialized),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -842,7 +839,7 @@
     self->ClearException();
 
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_initialized_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObjectInitialized),
                             self);
 
     EXPECT_TRUE(self->IsExceptionPending());
@@ -866,12 +863,6 @@
 #endif
 }
 
-
-#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
-extern "C" void art_quick_alloc_array_rosalloc(void);
-extern "C" void art_quick_alloc_array_resolved_rosalloc(void);
-#endif
-
 TEST_F(StubTest, AllocObjectArray) {
   TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
 
@@ -902,7 +893,7 @@
     size_t result = Invoke3(static_cast<size_t>(c->GetDexTypeIndex()),    // type_idx
                             reinterpret_cast<size_t>(c_obj->GetVirtualMethod(0)),  // arbitrary
                             10U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocArray),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending());
@@ -917,7 +908,7 @@
     // We can use nullptr in the second argument as we do not need a method here (not used in
     // resolved/initialized cases)
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 10U,
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_resolved_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocArrayResolved),
                             self);
 
     EXPECT_FALSE(self->IsExceptionPending()) << PrettyTypeOf(self->GetException(nullptr));
@@ -937,7 +928,7 @@
   {
     size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr),
                             GB,  // that should fail...
-                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_resolved_rosalloc),
+                            reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocArrayResolved),
                             self);
 
     EXPECT_TRUE(self->IsExceptionPending());
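
Rather than naming rosalloc-specific stubs, the tests now call whichever allocation entrypoints the runtime actually installed, read from the thread's TLS through the GetTlsPtr helper added above. A condensed sketch of that lookup (type_idx and referrer stand in for the real test arguments):

    uintptr_t entry =
        reinterpret_cast<uintptr_t>(GetTlsPtr(self)->quick_entrypoints.pAllocObject);
    size_t result = Invoke3(type_idx, referrer, 0U, entry, self);
    EXPECT_FALSE(self->IsExceptionPending());
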
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 84ca23b..ff3e7b6 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -181,8 +181,16 @@
   parallel_gc_threads_ = sysconf(_SC_NPROCESSORS_CONF) - 1;
   // Only the main GC thread, no workers.
   conc_gc_threads_ = 0;
-  // Default is CMS which is Sticky + Partial + Full CMS GC.
+  // The default GC type is set in makefiles.
+#if ART_DEFAULT_GC_TYPE_IS_CMS
   collector_type_ = gc::kCollectorTypeCMS;
+#elif ART_DEFAULT_GC_TYPE_IS_SS
+  collector_type_ = gc::kCollectorTypeSS;
+#elif ART_DEFAULT_GC_TYPE_IS_GSS
+  collector_type_ = gc::kCollectorTypeGSS;
+#else
+#error "ART default GC type must be set"
+#endif
   // If background_collector_type_ is kCollectorTypeNone, it defaults to the collector_type_ after
   // parsing options.
   background_collector_type_ = gc::kCollectorTypeNone;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index d78be92..5d4bf06 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -929,8 +929,8 @@
 }
 
 void Runtime::VisitRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
-  VisitConcurrentRoots(callback, arg, flags);
   VisitNonConcurrentRoots(callback, arg);
+  VisitConcurrentRoots(callback, arg, flags);
 }
 
 mirror::ObjectArray<mirror::ArtMethod>* Runtime::CreateDefaultImt(ClassLinker* cl) {
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 5e64e59..6667419 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -270,7 +270,7 @@
 void StackVisitor::SanityCheckFrame() const {
   if (kIsDebugBuild) {
     mirror::ArtMethod* method = GetMethod();
-    CHECK(method->GetClass() == mirror::ArtMethod::GetJavaLangReflectArtMethod());
+    CHECK_EQ(method->GetClass(), mirror::ArtMethod::GetJavaLangReflectArtMethod());
     if (cur_quick_frame_ != nullptr) {
       method->AssertPcIsWithinQuickCode(cur_quick_frame_pc_);
       // Frame sanity.
diff --git a/runtime/thread.h b/runtime/thread.h
index 8c17082..32311e1 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1082,6 +1082,7 @@
   friend class Runtime;  // For CreatePeer.
   friend class ScopedThreadStateChange;
   friend class SignalCatcher;  // For SetStateUnsafe.
+  friend class StubTest;  // For accessing entrypoints.
   friend class ThreadList;  // For ~Thread and Destroy.
 
   DISALLOW_COPY_AND_ASSIGN(Thread);