Merge "ART: fix NullCheckElimination to preserve MIR_IGNORE_NULL_CHECK"
diff --git a/Android.mk b/Android.mk
index 1edd543..9a813fd 100644
--- a/Android.mk
+++ b/Android.mk
@@ -329,9 +329,9 @@
 .PHONY: oat-target-$(1)
 oat-target-$(1): $$(OUT_OAT_FILE)
 
-$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY)
+$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OAT_DEPENDENCY)
 	@mkdir -p $$(dir $$@)
-	$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
+	$(DEX2OAT) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
 		--boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \
 		--dex-location=/$(1) --oat-file=$$@ \
 		--instruction-set=$(DEX2OAT_TARGET_ARCH) \
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index 1c462eb..6ef451f 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -28,10 +28,10 @@
 # NB depending on HOST_CORE_DEX_LOCATIONS so we are sure to have the dex files in frameworks for
 # run-test --no-image
 define create-core-oat-host-rules
-$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_LOCATIONS) $$(DEX2OATD_DEPENDENCY)
+$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_LOCATIONS) $$(DEX2OAT_DEPENDENCY)
 	@echo "host dex2oat: $$@ ($$?)"
 	@mkdir -p $$(dir $$@)
-	$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_IMAGE_XMS) --runtime-arg -Xmx$(DEX2OAT_IMAGE_XMX) \
+	$$(hide) $$(DEX2OAT) --runtime-arg -Xms$(DEX2OAT_IMAGE_XMS) --runtime-arg -Xmx$(DEX2OAT_IMAGE_XMX) \
 	  --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(HOST_CORE_DEX_FILES)) \
 	  $$(addprefix --dex-location=,$$(HOST_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)HOST_CORE_OAT_OUT) \
 	  --oat-location=$$($(1)HOST_CORE_OAT) --image=$$($(1)HOST_CORE_IMG_OUT) \
@@ -51,10 +51,10 @@
 endif
 
 define create-core-oat-target-rules
-$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
+$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OAT_DEPENDENCY)
 	@echo "target dex2oat: $$@ ($$?)"
 	@mkdir -p $$(dir $$@)
-	$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
+	$$(hide) $$(DEX2OAT) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
 	  --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(TARGET_CORE_DEX_FILES)) \
 	  $$(addprefix --dex-location=,$$(TARGET_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)TARGET_CORE_OAT_OUT) \
 	  --oat-location=$$($(1)TARGET_CORE_OAT) --image=$$($(1)TARGET_CORE_IMG_OUT) \
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 133044a..edc5bd0 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -100,6 +100,7 @@
 	optimizing/nodes.cc \
 	optimizing/optimizing_compiler.cc \
 	optimizing/parallel_move_resolver.cc \
+	optimizing/prepare_for_register_allocation.cc \
 	optimizing/register_allocator.cc \
 	optimizing/ssa_builder.cc \
 	optimizing/ssa_liveness_analysis.cc \
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index bcbfb5a..276b886 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -178,7 +178,7 @@
                                  BasicBlock* orig_block, BasicBlock** immed_pred_block_p) {
   DCHECK_GT(code_offset, orig_block->start_offset);
   MIR* insn = orig_block->first_mir_insn;
-  MIR* prev = NULL;
+  MIR* prev = NULL;  // Will be set to the instruction just before the split point.
   while (insn) {
     if (insn->offset == code_offset) break;
     prev = insn;
@@ -187,6 +187,10 @@
   if (insn == NULL) {
     LOG(FATAL) << "Break split failed";
   }
+  // At this point insn is the instruction at which we want to split: insn will be
+  // the first instruction of the "bottom" block, and prev will be the last
+  // instruction of the "top" block.
+
   BasicBlock* bottom_block = CreateNewBB(kDalvikByteCode);
 
   bottom_block->start_offset = code_offset;
@@ -259,7 +263,10 @@
   DCHECK(static_cast<int>(insn->dalvikInsn.opcode) == kMirOpCheck ||
          !MIR::DecodedInstruction::IsPseudoMirOp(insn->dalvikInsn.opcode));
   DCHECK_EQ(dex_pc_to_block_map_[insn->offset], orig_block->id);
+  // Scan the "bottom" instructions, remapping them to the
+  // newly created "bottom" block.
   MIR* p = insn;
+  p->bb = bottom_block->id;
   dex_pc_to_block_map_[p->offset] = bottom_block->id;
   while (p != bottom_block->last_mir_insn) {
     p = p->next;
@@ -273,7 +280,11 @@
      * the first in a BasicBlock, we can't hit it here.
      */
     if ((opcode == kMirOpCheck) || !MIR::DecodedInstruction::IsPseudoMirOp(opcode)) {
-      DCHECK_EQ(dex_pc_to_block_map_[p->offset], orig_block->id);
+      BasicBlockId mapped_id = dex_pc_to_block_map_[p->offset];
+      // At first glance the instructions should all be mapped to orig_block.
+      // However, multiple instructions may correspond to the same dex pc, hence an earlier
+      // instruction may have already moved the mapping for that dex pc to bottom_block.
+      DCHECK((mapped_id == orig_block->id) || (mapped_id == bottom_block->id));
       dex_pc_to_block_map_[p->offset] = bottom_block->id;
     }
   }
@@ -1537,7 +1548,8 @@
     return GetSSAName(ssa_reg);
   }
   if (IsConst(reg_location_[ssa_reg])) {
-    if (!singles_only && reg_location_[ssa_reg].wide) {
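+    // Print the 64-bit constant only for the low word of a wide pair; the high
+    // word falls through to the 32-bit constant path below.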
+    if (!singles_only && reg_location_[ssa_reg].wide &&
+        !reg_location_[ssa_reg].high_word) {
       return StringPrintf("v%d_%d#0x%" PRIx64, SRegToVReg(ssa_reg), GetSSASubscript(ssa_reg),
                           ConstantValueWide(reg_location_[ssa_reg]));
     } else {
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index fda6ef8..35dae00 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -186,6 +186,10 @@
 }
 
 static SelectInstructionKind SelectKind(MIR* mir) {
+  // Handle the case when mir is nullptr.
+  if (mir == nullptr) {
+    return kSelectNone;
+  }
   switch (mir->dalvikInsn.opcode) {
     case Instruction::MOVE:
     case Instruction::MOVE_OBJECT:
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index ab71921..a87b06a 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -117,6 +117,7 @@
 #define IS_SIGNED_IMM14(value) IS_SIGNED_IMM(14, value)
 #define IS_SIGNED_IMM19(value) IS_SIGNED_IMM(19, value)
 #define IS_SIGNED_IMM21(value) IS_SIGNED_IMM(21, value)
+#define IS_SIGNED_IMM26(value) IS_SIGNED_IMM(26, value)
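+// A 26-bit signed immediate covers [-2^25, 2^25-1] instruction words, i.e. roughly
+// +/-128 MiB of code; IS_SIGNED_IMM26 gates the branch offsets checked in assemble_arm64.cc.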
 
 // Quick macro used to define the registers.
 #define A64_REGISTER_CODE_LIST(R) \
@@ -240,6 +241,7 @@
   kA64B2ct,          // b.cond [01010100] imm_19[23-5] [0] cond[3-0].
   kA64Blr1x,         // blr [1101011000111111000000] rn[9-5] [00000].
   kA64Br1x,          // br  [1101011000011111000000] rn[9-5] [00000].
+  kA64Bl1t,          // bl  [100101] imm26[25-0].
   kA64Brk1d,         // brk [11010100001] imm_16[20-5] [00000].
   kA64B1t,           // b   [00010100] offset_26[25-0].
   kA64Cbnz2rt,       // cbnz[00110101] imm_19[23-5] rt[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index b1cf279..7c663a9 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -155,6 +155,10 @@
                  kFmtRegX, 9, 5, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, IS_UNARY_OP | REG_USE0 | IS_BRANCH,
                  "br", "!0x", kFixupNone),
+    ENCODING_MAP(kA64Bl1t, NO_VARIANTS(0x94000000),
+                 kFmtBitBlt, 25, 0, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
+                 kFmtUnused, -1, -1, IS_UNARY_OP | IS_BRANCH | REG_DEF_LR | NEEDS_FIXUP,
+                 "bl", "!0T", kFixupLabel),
     ENCODING_MAP(kA64Brk1d, NO_VARIANTS(0xd4200000),
                  kFmtBitBlt, 20, 5, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, IS_UNARY_OP | IS_BRANCH,
@@ -873,7 +877,7 @@
               ((target_lir->flags.generation == lir->flags.generation) ? 0 : offset_adjustment);
           int32_t delta = target - pc;
           DCHECK_EQ(delta & 0x3, 0);
-          if (!IS_SIGNED_IMM19(delta >> 2)) {
+          if (!IS_SIGNED_IMM26(delta >> 2)) {
             LOG(FATAL) << "Invalid jump range in kFixupT1Branch";
           }
           lir->operands[0] = delta >> 2;
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 6081f28..e8de876 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -21,6 +21,8 @@
 #include "dex/quick/mir_to_lir-inl.h"
 #include "gc/accounting/card_table.h"
 #include "entrypoints/quick/quick_entrypoints.h"
+#include "mirror/art_method.h"
+#include "mirror/object_array-inl.h"
 
 namespace art {
 
@@ -433,4 +435,117 @@
   NewLIR0(kA64Ret);
 }
 
+static bool Arm64UseRelativeCall(CompilationUnit* cu, const MethodReference& target_method) {
+  // Always emit relative calls.
+  return true;
+}
+
+/*
+ * Bit of a hack here - in the absence of a real scheduling pass,
+ * emit the next instruction in static & direct invoke sequences.
+ */
+static int Arm64NextSDCallInsn(CompilationUnit* cu, CallInfo* info,
+                               int state, const MethodReference& target_method,
+                               uint32_t unused,
+                               uintptr_t direct_code, uintptr_t direct_method,
+                               InvokeType type) {
+  Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
+  if (direct_code != 0 && direct_method != 0) {
+    switch (state) {
+    case 0:  // Get the current Method* [sets kArg0]
+      if (direct_code != static_cast<uintptr_t>(-1)) {
+        cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
+      } else if (Arm64UseRelativeCall(cu, target_method)) {
+        // Defer to linker patch.
+      } else {
+        cg->LoadCodeAddress(target_method, type, kInvokeTgt);
+      }
+      if (direct_method != static_cast<uintptr_t>(-1)) {
+        cg->LoadConstant(cg->TargetReg(kArg0, kRef), direct_method);
+      } else {
+        cg->LoadMethodAddress(target_method, type, kArg0);
+      }
+      break;
+    default:
+      return -1;
+    }
+  } else {
+    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
+    switch (state) {
+    case 0:  // Get the current Method* [sets kArg0]
+      // TUNING: we can save a reg copy if Method* has been promoted.
+      cg->LoadCurrMethodDirect(arg0_ref);
+      break;
+    case 1:  // Get method->dex_cache_resolved_methods_
+      cg->LoadRefDisp(arg0_ref,
+                      mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(),
+                      arg0_ref,
+                      kNotVolatile);
+      // Set up direct code if known.
+      if (direct_code != 0) {
+        if (direct_code != static_cast<uintptr_t>(-1)) {
+          cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
+        } else if (Arm64UseRelativeCall(cu, target_method)) {
+          // Defer to linker patch.
+        } else {
+          CHECK_LT(target_method.dex_method_index, target_method.dex_file->NumMethodIds());
+          cg->LoadCodeAddress(target_method, type, kInvokeTgt);
+        }
+      }
+      break;
+    case 2:  // Grab target method*
+      CHECK_EQ(cu->dex_file, target_method.dex_file);
+      cg->LoadRefDisp(arg0_ref,
+                      mirror::ObjectArray<mirror::Object>::OffsetOfElement(
+                          target_method.dex_method_index).Int32Value(),
+                      arg0_ref,
+                      kNotVolatile);
+      break;
+    case 3:  // Grab the code from the method*
+      if (direct_code == 0) {
+        // kInvokeTgt := arg0_ref->entrypoint
+        cg->LoadWordDisp(arg0_ref,
+                         mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value(),
+                         cg->TargetPtrReg(kInvokeTgt));
+      }
+      break;
+    default:
+      return -1;
+    }
+  }
+  return state + 1;
+}
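+
+// Rough shape of the sequence the state machine above drives for a same-dex-file
+// static/direct call with unknown direct code (-1), where the relative-call path applies:
+//   state 0: kArg0 := current Method*
+//   state 1: kArg0 := kArg0->dex_cache_resolved_methods_
+//   state 2: kArg0 := kArg0[target_method_idx]
+//   state 3: skipped (direct_code != 0); the BL from CallWithLinkerFixup is patched at link time.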
+
+NextCallInsn Arm64Mir2Lir::GetNextSDCallInsn() {
+  return Arm64NextSDCallInsn;
+}
+
+LIR* Arm64Mir2Lir::CallWithLinkerFixup(const MethodReference& target_method, InvokeType type) {
+  // For ARM64, just generate a relative BL instruction that will be filled in at 'link time'.
+  // If the target turns out to be too far, the linker will generate a thunk for dispatch.
+  int target_method_idx = target_method.dex_method_index;
+  const DexFile* target_dex_file = target_method.dex_file;
+
+  // Generate the call instruction and save index, dex_file, and type.
+  // NOTE: Method deduplication takes linker patches into account, so we can just pass 0
+  // as a placeholder for the offset.
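+  // The placeholder BL (0x94000000 with imm26 = 0) is rewritten by
+  // Arm64RelativeCallPatcher::Patch() in oat_writer.cc once the target's code offset is known.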
+  LIR* call = RawLIR(current_dalvik_offset_, kA64Bl1t, 0,
+                     target_method_idx, WrapPointer(const_cast<DexFile*>(target_dex_file)), type);
+  AppendLIR(call);
+  call_method_insns_.push_back(call);
+  return call;
+}
+
+LIR* Arm64Mir2Lir::GenCallInsn(const MirMethodLoweringInfo& method_info) {
+  LIR* call_insn;
+  if (method_info.FastPath() && Arm64UseRelativeCall(cu_, method_info.GetTargetMethod()) &&
+      (method_info.GetSharpType() == kDirect || method_info.GetSharpType() == kStatic) &&
+      method_info.DirectCode() == static_cast<uintptr_t>(-1)) {
+    call_insn = CallWithLinkerFixup(method_info.GetTargetMethod(), method_info.GetSharpType());
+  } else {
+    call_insn = OpReg(kOpBlx, TargetPtrReg(kInvokeTgt));
+  }
+  return call_insn;
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 55cc938..93d9b34 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -259,6 +259,28 @@
 
   size_t GetInstructionOffset(LIR* lir) OVERRIDE;
 
+  NextCallInsn GetNextSDCallInsn() OVERRIDE;
+
+  /*
+   * @brief Generate a relative call to the method that will be patched at link time.
+   * @param target_method The MethodReference of the method to be invoked.
+   * @param type How the method will be invoked.
+   * @returns Call instruction
+   */
+  LIR* CallWithLinkerFixup(const MethodReference& target_method, InvokeType type);
+
+  /*
+   * @brief Generate the actual call insn based on the method info.
+   * @param method_info the lowering info for the method call.
+   * @returns Call instruction
+   */
+  virtual LIR* GenCallInsn(const MirMethodLoweringInfo& method_info) OVERRIDE;
+
+  /*
+   * @brief Handle ARM64 specific literals.
+   */
+  void InstallLiteralPools() OVERRIDE;
+
   LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
 
  private:
@@ -396,6 +418,8 @@
 
   InToRegStorageMapping in_to_reg_storage_mapping_;
   static const A64EncodingMap EncodingMap[kA64Last];
+
+  ArenaVector<LIR*> call_method_insns_;
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 0462530..ba47883 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -585,7 +585,8 @@
 }
 
 Arm64Mir2Lir::Arm64Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
-    : Mir2Lir(cu, mir_graph, arena) {
+    : Mir2Lir(cu, mir_graph, arena),
+      call_method_insns_(arena->Adapter()) {
   // Sanity check - make sure encoding map lines up.
   for (int i = 0; i < kA64Last; i++) {
     if (UNWIDE(Arm64Mir2Lir::EncodingMap[i].opcode) != i) {
@@ -1201,4 +1202,21 @@
   return call_state;
 }
 
+void Arm64Mir2Lir::InstallLiteralPools() {
+  // PC-relative calls to methods.
+  patches_.reserve(call_method_insns_.size());
+  for (LIR* p : call_method_insns_) {
+    DCHECK_EQ(p->opcode, kA64Bl1t);
+    uint32_t target_method_idx = p->operands[1];
+    const DexFile* target_dex_file =
+        reinterpret_cast<const DexFile*>(UnwrapPointer(p->operands[2]));
+
+    patches_.push_back(LinkerPatch::RelativeCodePatch(p->offset,
+                                                      target_dex_file, target_method_idx));
+  }
+
+  // And do the normal processing.
+  Mir2Lir::InstallLiteralPools();
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index bafb57d..408c73d 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -495,7 +495,8 @@
                           uintptr_t direct_code, uintptr_t direct_method,
                           InvokeType type) {
   DCHECK(cu->instruction_set != kX86 && cu->instruction_set != kX86_64 &&
-         cu->instruction_set != kThumb2 && cu->instruction_set != kArm);
+         cu->instruction_set != kThumb2 && cu->instruction_set != kArm &&
+         cu->instruction_set != kArm64);
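+  // Arm64 now provides its own Arm64NextSDCallInsn (call_arm64.cc), so it must not
+  // reach this fallback.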
   Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
   if (direct_code != 0 && direct_method != 0) {
     switch (state) {
@@ -1751,7 +1752,8 @@
 
 LIR* Mir2Lir::GenCallInsn(const MirMethodLoweringInfo& method_info) {
   DCHECK(cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64 &&
-         cu_->instruction_set != kThumb2 && cu_->instruction_set != kArm);
+         cu_->instruction_set != kThumb2 && cu_->instruction_set != kArm &&
+         cu_->instruction_set != kArm64);
   return OpReg(kOpBlx, TargetPtrReg(kInvokeTgt));
 }
 
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index e7160ad..0ac1299 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -1235,6 +1235,7 @@
       // Combine check and work halves of throwing instruction.
       MIR* work_half = mir->meta.throw_insn;
       mir->dalvikInsn.opcode = work_half->dalvikInsn.opcode;
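+      // Copy the optimization flags too, so that MIR_IGNORE_NULL_CHECK set on the
+      // work half is not lost when the halves are combined.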
+      mir->optimization_flags = work_half->optimization_flags;
       mir->meta = work_half->meta;  // Whatever the work_half had, we need to copy it.
       opcode = work_half->dalvikInsn.opcode;
       SSARepresentation* ssa_rep = work_half->ssa_rep;
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index dd64368..e64d2ab 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -38,6 +38,7 @@
 #include "scoped_thread_state_change.h"
 #include "handle_scope-inl.h"
 #include "utils/arm/assembler_thumb2.h"
+#include "utils/arm64/assembler_arm64.h"
 #include "verifier/method_verifier.h"
 
 namespace art {
@@ -117,10 +118,14 @@
   DISALLOW_COPY_AND_ASSIGN(X86RelativeCallPatcher);
 };
 
-class OatWriter::Thumb2RelativeCallPatcher FINAL : public RelativeCallPatcher {
+class OatWriter::ArmBaseRelativeCallPatcher : public RelativeCallPatcher {
  public:
-  explicit Thumb2RelativeCallPatcher(OatWriter* writer)
-      : writer_(writer), thunk_code_(CompileThunkCode()),
+  ArmBaseRelativeCallPatcher(OatWriter* writer,
+                             InstructionSet instruction_set, std::vector<uint8_t> thunk_code,
+                             uint32_t max_positive_displacement, uint32_t max_negative_displacement)
+      : writer_(writer), instruction_set_(instruction_set), thunk_code_(thunk_code),
+        max_positive_displacement_(max_positive_displacement),
+        max_negative_displacement_(max_negative_displacement),
         thunk_locations_(), current_thunk_to_write_(0u), unprocessed_patches_() {
   }
 
@@ -130,11 +135,11 @@
     // of code. To avoid any alignment discrepancies for the final chunk, we always align the
 // offset after reserving or writing any chunk.
     if (UNLIKELY(compiled_method == nullptr)) {
-      uint32_t aligned_offset = CompiledMethod::AlignCode(offset, kThumb2);
+      uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
       bool needs_thunk = ReserveSpaceProcessPatches(aligned_offset);
       if (needs_thunk) {
         thunk_locations_.push_back(aligned_offset);
-        offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), kThumb2);
+        offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), instruction_set_);
       }
       return offset;
     }
@@ -143,14 +148,14 @@
     uint32_t quick_code_offset = compiled_method->AlignCode(offset) + sizeof(OatQuickMethodHeader);
     uint32_t next_aligned_offset = compiled_method->AlignCode(quick_code_offset + quick_code_size);
     if (!unprocessed_patches_.empty() &&
-        next_aligned_offset - unprocessed_patches_.front().second > kMaxPositiveDisplacement) {
+        next_aligned_offset - unprocessed_patches_.front().second > max_positive_displacement_) {
       bool needs_thunk = ReserveSpaceProcessPatches(next_aligned_offset);
       if (needs_thunk) {
         // A single thunk will cover all pending patches.
         unprocessed_patches_.clear();
         uint32_t thunk_location = compiled_method->AlignCode(offset);
         thunk_locations_.push_back(thunk_location);
-        offset = CompiledMethod::AlignCode(thunk_location + thunk_code_.size(), kThumb2);
+        offset = CompiledMethod::AlignCode(thunk_location + thunk_code_.size(), instruction_set_);
       }
     }
     for (const LinkerPatch& patch : compiled_method->GetPatches()) {
@@ -166,7 +171,7 @@
     if (current_thunk_to_write_ == thunk_locations_.size()) {
       return offset;
     }
-    uint32_t aligned_offset = CompiledMethod::AlignCode(offset, kThumb2);
+    uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
     if (UNLIKELY(aligned_offset == thunk_locations_[current_thunk_to_write_])) {
       ++current_thunk_to_write_;
       uint32_t aligned_code_delta = aligned_offset - offset;
@@ -179,7 +184,7 @@
       writer_->size_relative_call_thunks_ += thunk_code_.size();
       uint32_t thunk_end_offset = aligned_offset + thunk_code_.size();
       // Align after writing chunk, see the ReserveSpace() above.
-      offset = CompiledMethod::AlignCode(thunk_end_offset, kThumb2);
+      offset = CompiledMethod::AlignCode(thunk_end_offset, instruction_set_);
       aligned_code_delta = offset - thunk_end_offset;
       if (aligned_code_delta != 0u && !writer_->WriteCodeAlignment(out, aligned_code_delta)) {
         return 0u;
@@ -188,30 +193,88 @@
     return offset;
   }
 
-  void Patch(std::vector<uint8_t>* code, uint32_t literal_offset, uint32_t patch_offset,
-             uint32_t target_offset) OVERRIDE {
-    DCHECK_LE(literal_offset + 4u, code->size());
-    DCHECK_EQ(literal_offset & 1u, 0u);
-    DCHECK_EQ(patch_offset & 1u, 0u);
-    DCHECK_EQ(target_offset & 1u, 1u);  // Thumb2 mode bit.
+ protected:
+  uint32_t CalculateDisplacement(uint32_t patch_offset, uint32_t target_offset) {
     // Unsigned arithmetic with its well-defined overflow behavior is just fine here.
-    uint32_t displacement = target_offset - 1u - patch_offset;
+    uint32_t displacement = target_offset - patch_offset;
     // NOTE: With unsigned arithmetic we do mean to use && rather than || below.
-    if (displacement > kMaxPositiveDisplacement && displacement < -kMaxNegativeDisplacement) {
+    if (displacement > max_positive_displacement_ && displacement < -max_negative_displacement_) {
       // Unwritten thunks have higher offsets, check if it's within range.
       DCHECK(current_thunk_to_write_ == thunk_locations_.size() ||
              thunk_locations_[current_thunk_to_write_] > patch_offset);
       if (current_thunk_to_write_ != thunk_locations_.size() &&
-          thunk_locations_[current_thunk_to_write_] - patch_offset < kMaxPositiveDisplacement) {
+          thunk_locations_[current_thunk_to_write_] - patch_offset < max_positive_displacement_) {
         displacement = thunk_locations_[current_thunk_to_write_] - patch_offset;
       } else {
         // We must have a previous thunk then.
         DCHECK_NE(current_thunk_to_write_, 0u);
         DCHECK_LT(thunk_locations_[current_thunk_to_write_ - 1], patch_offset);
         displacement = thunk_locations_[current_thunk_to_write_ - 1] - patch_offset;
-        DCHECK(displacement >= -kMaxNegativeDisplacement);
+        DCHECK(displacement >= -max_negative_displacement_);
       }
     }
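+    // With uint32_t wraparound a negative displacement d is represented as 2^32 + d,
+    // so the && check above rejects exactly the values outside
+    // [-max_negative_displacement_, max_positive_displacement_].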
+    return displacement;
+  }
+
+ private:
+  bool ReserveSpaceProcessPatches(uint32_t next_aligned_offset) {
+    // Process as many patches as possible, stop only on unresolved targets or calls too far back.
+    while (!unprocessed_patches_.empty()) {
+      uint32_t patch_offset = unprocessed_patches_.front().second;
+      auto it = writer_->method_offset_map_.find(unprocessed_patches_.front().first);
+      if (it == writer_->method_offset_map_.end()) {
+        // If still unresolved, check if we have a thunk within range.
+        DCHECK(thunk_locations_.empty() || thunk_locations_.back() <= patch_offset);
+        if (thunk_locations_.empty() ||
+            patch_offset - thunk_locations_.back() > max_negative_displacement_) {
+          return next_aligned_offset - patch_offset > max_positive_displacement_;
+        }
+      } else if (it->second >= patch_offset) {
+        DCHECK_LE(it->second - patch_offset, max_positive_displacement_);
+      } else {
+        // When calling back, check if we have a thunk that's closer than the actual target.
+        uint32_t target_offset = (thunk_locations_.empty() || it->second > thunk_locations_.back())
+            ? it->second
+            : thunk_locations_.back();
+        DCHECK_GT(patch_offset, target_offset);
+        if (patch_offset - target_offset > max_negative_displacement_) {
+          return true;
+        }
+      }
+      unprocessed_patches_.pop_front();
+    }
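+    // Every pending patch was either in range or covered by an existing thunk.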
+    return false;
+  }
+
+  OatWriter* const writer_;
+  const InstructionSet instruction_set_;
+  const std::vector<uint8_t> thunk_code_;
+  const uint32_t max_positive_displacement_;
+  const uint32_t max_negative_displacement_;
+  std::vector<uint32_t> thunk_locations_;
+  size_t current_thunk_to_write_;
+
+  // ReserveSpace() tracks unprocessed patches.
+  typedef std::pair<MethodReference, uint32_t> UnprocessedPatch;
+  std::deque<UnprocessedPatch> unprocessed_patches_;
+
+  DISALLOW_COPY_AND_ASSIGN(ArmBaseRelativeCallPatcher);
+};
+
+class OatWriter::Thumb2RelativeCallPatcher FINAL : public ArmBaseRelativeCallPatcher {
+ public:
+  explicit Thumb2RelativeCallPatcher(OatWriter* writer)
+      : ArmBaseRelativeCallPatcher(writer, kThumb2, CompileThunkCode(),
+                                   kMaxPositiveDisplacement, kMaxNegativeDisplacement) {
+  }
+
+  void Patch(std::vector<uint8_t>* code, uint32_t literal_offset, uint32_t patch_offset,
+             uint32_t target_offset) OVERRIDE {
+    DCHECK_LE(literal_offset + 4u, code->size());
+    DCHECK_EQ(literal_offset & 1u, 0u);
+    DCHECK_EQ(patch_offset & 1u, 0u);
+    DCHECK_EQ(target_offset & 1u, 1u);  // Thumb2 mode bit.
+    uint32_t displacement = CalculateDisplacement(patch_offset, target_offset & ~1u);
     displacement -= kPcDisplacement;  // The base PC is at the end of the 4-byte patch.
     DCHECK_EQ(displacement & 1u, 0u);
     DCHECK((displacement >> 24) == 0u || (displacement >> 24) == 255u);  // 25-bit signed.
@@ -237,35 +300,6 @@
   }
 
  private:
-  bool ReserveSpaceProcessPatches(uint32_t next_aligned_offset) {
-    // Process as many patches as possible, stop only on unresolved targets or calls too far back.
-    while (!unprocessed_patches_.empty()) {
-      uint32_t patch_offset = unprocessed_patches_.front().second;
-      auto it = writer_->method_offset_map_.find(unprocessed_patches_.front().first);
-      if (it == writer_->method_offset_map_.end()) {
-        // If still unresolved, check if we have a thunk within range.
-        DCHECK(thunk_locations_.empty() || thunk_locations_.back() <= patch_offset);
-        if (thunk_locations_.empty() ||
-            patch_offset - thunk_locations_.back() > kMaxNegativeDisplacement) {
-          return next_aligned_offset - patch_offset > kMaxPositiveDisplacement;
-        }
-      } else if (it->second >= patch_offset) {
-        DCHECK_LE(it->second - patch_offset, kMaxPositiveDisplacement);
-      } else {
-        // When calling back, check if we have a thunk that's closer than the actual target.
-        uint32_t target_offset = (thunk_locations_.empty() || it->second > thunk_locations_.back())
-            ? it->second
-            : thunk_locations_.back();
-        DCHECK_GT(patch_offset, target_offset);
-        if (patch_offset - target_offset > kMaxNegativeDisplacement) {
-          return true;
-        }
-      }
-      unprocessed_patches_.pop_front();
-    }
-    return false;
-  }
-
   static std::vector<uint8_t> CompileThunkCode() {
     // The thunk just uses the entry point in the ArtMethod. This works even for calls
     // to the generic JNI and interpreter trampolines.
@@ -289,18 +323,60 @@
   static constexpr uint32_t kMaxPositiveDisplacement = (1u << 24) - 2 + kPcDisplacement;
   static constexpr uint32_t kMaxNegativeDisplacement = (1u << 24) - kPcDisplacement;
 
-  OatWriter* const writer_;
-  const std::vector<uint8_t> thunk_code_;
-  std::vector<uint32_t> thunk_locations_;
-  size_t current_thunk_to_write_;
-
-  // ReserveSpace() tracks unprocessed patches.
-  typedef std::pair<MethodReference, uint32_t> UnprocessedPatch;
-  std::deque<UnprocessedPatch> unprocessed_patches_;
-
   DISALLOW_COPY_AND_ASSIGN(Thumb2RelativeCallPatcher);
 };
 
+class OatWriter::Arm64RelativeCallPatcher FINAL : public ArmBaseRelativeCallPatcher {
+ public:
+  explicit Arm64RelativeCallPatcher(OatWriter* writer)
+      : ArmBaseRelativeCallPatcher(writer, kArm64, CompileThunkCode(),
+                                   kMaxPositiveDisplacement, kMaxNegativeDisplacement) {
+  }
+
+  void Patch(std::vector<uint8_t>* code, uint32_t literal_offset, uint32_t patch_offset,
+             uint32_t target_offset) OVERRIDE {
+    DCHECK_LE(literal_offset + 4u, code->size());
+    DCHECK_EQ(literal_offset & 3u, 0u);
+    DCHECK_EQ(patch_offset & 3u, 0u);
+    DCHECK_EQ(target_offset & 3u, 0u);
+    uint32_t displacement = CalculateDisplacement(patch_offset, target_offset & ~1u);
+    DCHECK_EQ(displacement & 3u, 0u);
+    DCHECK((displacement >> 27) == 0u || (displacement >> 27) == 31u);  // 28-bit signed.
+    uint32_t value = (displacement & 0x0fffffffu) >> 2;
+    value |= 0x94000000;  // BL
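+    // e.g. displacement +8 encodes as 0x94000002; displacement -4 encodes as 0x97ffffff.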
+
+    uint8_t* addr = &(*code)[literal_offset];
+    // Check that we're just overwriting an existing BL.
+    DCHECK_EQ(addr[3] & 0xfc, 0x94);
+    // Write the new BL.
+    addr[0] = (value >> 0) & 0xff;
+    addr[1] = (value >> 8) & 0xff;
+    addr[2] = (value >> 16) & 0xff;
+    addr[3] = (value >> 24) & 0xff;
+  }
+
+ private:
+  static std::vector<uint8_t> CompileThunkCode() {
+    // The thunk just uses the entry point in the ArtMethod. This works even for calls
+    // to the generic JNI and interpreter trampolines.
+    arm64::Arm64Assembler assembler;
+    Offset offset(mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value());
+    assembler.JumpTo(ManagedRegister(arm64::X0), offset, ManagedRegister(arm64::IP0));
+    std::vector<uint8_t> thunk_code(assembler.CodeSize());
+    MemoryRegion code(thunk_code.data(), thunk_code.size());
+    assembler.FinalizeInstructions(code);
+    return thunk_code;
+  }
+
+  // Maximum positive and negative displacement measured from the patch location.
+  // (Signed 28 bit displacement with the last two bits 0 has range [-2^27, 2^27-4] measured from
+  // the ARM64 PC pointing to the BL.)
+  static constexpr uint32_t kMaxPositiveDisplacement = (1u << 27) - 4u;
+  static constexpr uint32_t kMaxNegativeDisplacement = (1u << 27);
+
+  DISALLOW_COPY_AND_ASSIGN(Arm64RelativeCallPatcher);
+};
+
 #define DCHECK_OFFSET() \
   DCHECK_EQ(static_cast<off_t>(file_offset + relative_offset), out->Seek(0, kSeekCurrent)) \
     << "file_offset=" << file_offset << " relative_offset=" << relative_offset
@@ -373,7 +449,8 @@
       relative_call_patcher_.reset(new Thumb2RelativeCallPatcher(this));
       break;
     case kArm64:
-      // TODO: Implement relative calls for arm64.
+      relative_call_patcher_.reset(new Arm64RelativeCallPatcher(this));
+      break;
     default:
       relative_call_patcher_.reset(new NoRelativeCallPatcher);
       break;
@@ -868,8 +945,8 @@
     : OatDexMethodVisitor(writer, relative_offset),
       out_(out),
       file_offset_(file_offset),
-      self_(Thread::Current()),
-      old_no_thread_suspension_cause_(self_->StartAssertNoThreadSuspension("OatWriter patching")),
+      soa_(Thread::Current()),
+      no_thread_suspension_(soa_.Self(), "OatWriter patching"),
       class_linker_(Runtime::Current()->GetClassLinker()),
       dex_cache_(nullptr) {
     if (writer_->image_writer_ != nullptr) {
@@ -877,12 +954,9 @@
       CHECK(writer_->image_writer_->IsImageAddressSpaceReady());
       patched_code_.reserve(16 * KB);
     }
-    self_->TransitionFromSuspendedToRunnable();
   }
 
   ~WriteCodeMethodVisitor() UNLOCK_FUNCTION(Locks::mutator_lock_) {
-    self_->EndAssertNoThreadSuspension(old_no_thread_suspension_cause_);
-    self_->TransitionFromRunnableToSuspended(kNative);
   }
 
   bool StartClass(const DexFile* dex_file, size_t class_def_index)
@@ -997,9 +1071,9 @@
 
  private:
   OutputStream* const out_;
-  size_t const file_offset_;
-  Thread* const self_;
-  const char* const old_no_thread_suspension_cause_;  // TODO: Use ScopedAssertNoThreadSuspension.
+  const size_t file_offset_;
+  const ScopedObjectAccess soa_;
+  const ScopedAssertNoThreadSuspension no_thread_suspension_;
   ClassLinker* const class_linker_;
   mirror::DexCache* dex_cache_;
   std::vector<uint8_t> patched_code_;
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 5545ba8..a1e61b9 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -330,7 +330,9 @@
   class RelativeCallPatcher;
   class NoRelativeCallPatcher;
   class X86RelativeCallPatcher;
+  class ArmBaseRelativeCallPatcher;
   class Thumb2RelativeCallPatcher;
+  class Arm64RelativeCallPatcher;
 
   std::unique_ptr<RelativeCallPatcher> relative_call_patcher_;
 
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 5015bd0..5bcc65b 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -102,28 +102,19 @@
 
   uint32_t pos = 1;
   for (int i = 0; i < number_of_parameters; i++) {
-    switch (shorty[pos++]) {
-      case 'F':
-      case 'D': {
-        return false;
-      }
-
-      default: {
-        // integer and reference parameters.
-        HParameterValue* parameter =
-            new (arena_) HParameterValue(parameter_index++, Primitive::GetType(shorty[pos - 1]));
-        entry_block_->AddInstruction(parameter);
-        HLocal* local = GetLocalAt(locals_index++);
-        // Store the parameter value in the local that the dex code will use
-        // to reference that parameter.
-        entry_block_->AddInstruction(new (arena_) HStoreLocal(local, parameter));
-        if (parameter->GetType() == Primitive::kPrimLong) {
-          i++;
-          locals_index++;
-          parameter_index++;
-        }
-        break;
-      }
+    HParameterValue* parameter =
+        new (arena_) HParameterValue(parameter_index++, Primitive::GetType(shorty[pos++]));
+    entry_block_->AddInstruction(parameter);
+    HLocal* local = GetLocalAt(locals_index++);
+    // Store the parameter value in the local that the dex code will use
+    // to reference that parameter.
+    entry_block_->AddInstruction(new (arena_) HStoreLocal(local, parameter));
+    bool is_wide = (parameter->GetType() == Primitive::kPrimLong)
+        || (parameter->GetType() == Primitive::kPrimDouble);
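+    // Longs and doubles occupy two vregs, so they consume an extra local and an
+    // extra parameter slot.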
+    if (is_wide) {
+      i++;
+      locals_index++;
+      parameter_index++;
     }
   }
   return true;
@@ -402,10 +393,8 @@
   uint32_t argument_index = start_index;
   for (size_t i = start_index; i < number_of_vreg_arguments; i++, argument_index++) {
     Primitive::Type type = Primitive::GetType(descriptor[descriptor_index++]);
-    if (!IsTypeSupported(type)) {
-      return false;
-    }
-    if (!is_range && type == Primitive::kPrimLong && args[i] + 1 != args[i + 1]) {
+    bool is_wide = (type == Primitive::kPrimLong) || (type == Primitive::kPrimDouble);
+    if (!is_range && is_wide && args[i] + 1 != args[i + 1]) {
       LOG(WARNING) << "Non sequential register pair in " << dex_compilation_unit_->GetSymbol()
                    << " at " << dex_offset;
       // We do not implement non sequential register pair.
@@ -413,15 +402,11 @@
     }
     HInstruction* arg = LoadLocal(is_range ? register_index + i : args[i], type);
     invoke->SetArgumentAt(argument_index, arg);
-    if (type == Primitive::kPrimLong) {
+    if (is_wide) {
       i++;
     }
   }
 
-  if (!IsTypeSupported(return_type)) {
-    return false;
-  }
-
   DCHECK_EQ(argument_index, number_of_arguments);
   current_block_->AddInstruction(invoke);
   return true;
@@ -648,17 +633,22 @@
     }
 
     case Instruction::RETURN: {
-      BuildReturn(instruction, Primitive::kPrimInt);
+      DCHECK_NE(return_type_, Primitive::kPrimNot);
+      DCHECK_NE(return_type_, Primitive::kPrimLong);
+      DCHECK_NE(return_type_, Primitive::kPrimDouble);
+      BuildReturn(instruction, return_type_);
       break;
     }
 
     case Instruction::RETURN_OBJECT: {
-      BuildReturn(instruction, Primitive::kPrimNot);
+      DCHECK(return_type_ == Primitive::kPrimNot);
+      BuildReturn(instruction, return_type_);
       break;
     }
 
     case Instruction::RETURN_WIDE: {
-      BuildReturn(instruction, Primitive::kPrimLong);
+      DCHECK(return_type_ == Primitive::kPrimDouble || return_type_ == Primitive::kPrimLong);
+      BuildReturn(instruction, return_type_);
       break;
     }
 
@@ -698,6 +688,16 @@
       break;
     }
 
+    case Instruction::ADD_DOUBLE: {
+      Binop_23x<HAdd>(instruction, Primitive::kPrimDouble);
+      break;
+    }
+
+    case Instruction::ADD_FLOAT: {
+      Binop_23x<HAdd>(instruction, Primitive::kPrimFloat);
+      break;
+    }
+
     case Instruction::SUB_INT: {
       Binop_23x<HSub>(instruction, Primitive::kPrimInt);
       break;
@@ -718,6 +718,16 @@
       break;
     }
 
+    case Instruction::ADD_DOUBLE_2ADDR: {
+      Binop_12x<HAdd>(instruction, Primitive::kPrimDouble);
+      break;
+    }
+
+    case Instruction::ADD_FLOAT_2ADDR: {
+      Binop_12x<HAdd>(instruction, Primitive::kPrimFloat);
+      break;
+    }
+
     case Instruction::SUB_INT_2ADDR: {
       Binop_12x<HSub>(instruction, Primitive::kPrimInt);
       break;
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index e143786..7d7b188 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_BUILDER_H_
 
 #include "dex_file.h"
+#include "dex_file-inl.h"
 #include "driver/compiler_driver.h"
 #include "driver/dex_compilation_unit.h"
 #include "primitive.h"
@@ -32,9 +33,9 @@
 class HGraphBuilder : public ValueObject {
  public:
   HGraphBuilder(ArenaAllocator* arena,
-                DexCompilationUnit* dex_compilation_unit = nullptr,
-                const DexFile* dex_file = nullptr,
-                CompilerDriver* driver = nullptr)
+                DexCompilationUnit* dex_compilation_unit,
+                const DexFile* dex_file,
+                CompilerDriver* driver)
       : arena_(arena),
         branch_targets_(arena, 0),
         locals_(arena, 0),
@@ -46,7 +47,24 @@
         constant1_(nullptr),
         dex_file_(dex_file),
         dex_compilation_unit_(dex_compilation_unit),
-        compiler_driver_(driver) {}
+        compiler_driver_(driver),
+        return_type_(Primitive::GetType(dex_compilation_unit_->GetShorty()[0])) {}
+
+  // Only for unit testing.
+  HGraphBuilder(ArenaAllocator* arena, Primitive::Type return_type = Primitive::kPrimInt)
+      : arena_(arena),
+        branch_targets_(arena, 0),
+        locals_(arena, 0),
+        entry_block_(nullptr),
+        exit_block_(nullptr),
+        current_block_(nullptr),
+        graph_(nullptr),
+        constant0_(nullptr),
+        constant1_(nullptr),
+        dex_file_(nullptr),
+        dex_compilation_unit_(nullptr),
+        compiler_driver_(nullptr),
+        return_type_(return_type) {}
 
   HGraph* BuildGraph(const DexFile::CodeItem& code);
 
@@ -128,6 +146,7 @@
   const DexFile* const dex_file_;
   DexCompilationUnit* const dex_compilation_unit_;
   CompilerDriver* const compiler_driver_;
+  const Primitive::Type return_type_;
 
   DISALLOW_COPY_AND_ASSIGN(HGraphBuilder);
 };
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 2a9a7b3..e6fe067 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -189,6 +189,9 @@
       if (loc.GetPolicy() == Location::kRequiresRegister) {
         loc = Location::RegisterLocation(
             AllocateFreeRegister(input->GetType(), blocked_registers_));
+      } else if (loc.GetPolicy() == Location::kRequiresFpuRegister) {
+        loc = Location::FpuRegisterLocation(
+            AllocateFreeRegister(input->GetType(), blocked_registers_));
       } else {
         DCHECK_EQ(loc.GetPolicy(), Location::kAny);
         HLoadLocal* load = input->AsLoadLocal();
@@ -223,6 +226,10 @@
         result_location = Location::RegisterLocation(
             AllocateFreeRegister(instruction->GetType(), blocked_registers_));
         break;
+      case Location::kRequiresFpuRegister:
+        result_location = Location::FpuRegisterLocation(
+            AllocateFreeRegister(instruction->GetType(), blocked_registers_));
+        break;
       case Location::kSameAsFirstInput:
         result_location = locations->InAt(0);
         break;
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index b58f3b3..c7623fe 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -203,28 +203,42 @@
   DISALLOW_COPY_AND_ASSIGN(CodeGenerator);
 };
 
-template <typename T>
+template <typename C, typename F>
 class CallingConvention {
  public:
-  CallingConvention(const T* registers, int number_of_registers)
-      : registers_(registers), number_of_registers_(number_of_registers) {}
+  CallingConvention(const C* registers,
+                    size_t number_of_registers,
+                    const F* fpu_registers,
+                    size_t number_of_fpu_registers)
+      : registers_(registers),
+        number_of_registers_(number_of_registers),
+        fpu_registers_(fpu_registers),
+        number_of_fpu_registers_(number_of_fpu_registers) {}
 
   size_t GetNumberOfRegisters() const { return number_of_registers_; }
+  size_t GetNumberOfFpuRegisters() const { return number_of_fpu_registers_; }
 
-  T GetRegisterAt(size_t index) const {
+  C GetRegisterAt(size_t index) const {
     DCHECK_LT(index, number_of_registers_);
     return registers_[index];
   }
 
-  uint8_t GetStackOffsetOf(size_t index) const {
+  F GetFpuRegisterAt(size_t index) const {
+    DCHECK_LT(index, number_of_fpu_registers_);
+    return fpu_registers_[index];
+  }
+
+  size_t GetStackOffsetOf(size_t index) const {
     // We still reserve the space for parameters passed by registers.
     // Add one for the method pointer.
     return (index + 1) * kVRegSize;
   }
 
  private:
-  const T* registers_;
+  const C* registers_;
   const size_t number_of_registers_;
+  const F* fpu_registers_;
+  const size_t number_of_fpu_registers_;
 
   DISALLOW_COPY_AND_ASSIGN(CallingConvention);
 };
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 2aa04d4..d116905 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -47,12 +47,16 @@
 static constexpr Register kRuntimeParameterCoreRegisters[] = { R0, R1, R2 };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
+static constexpr DRegister kRuntimeParameterFpuRegisters[] = { };
+static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
 
-class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+class InvokeRuntimeCallingConvention : public CallingConvention<Register, DRegister> {
  public:
   InvokeRuntimeCallingConvention()
       : CallingConvention(kRuntimeParameterCoreRegisters,
-                          kRuntimeParameterCoreRegistersLength) {}
+                          kRuntimeParameterCoreRegistersLength,
+                          kRuntimeParameterFpuRegisters,
+                          kRuntimeParameterFpuRegistersLength) {}
 
  private:
   DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
@@ -218,6 +222,10 @@
   return blocked_registers + kNumberOfAllocIds;
 }
 
+static bool* GetBlockedDRegisters(bool* blocked_registers) {
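+  // The blocked-register flags are indexed per allocation id: core registers
+  // first, then S registers, then D registers.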
+  return blocked_registers + kNumberOfCoreRegisters + kNumberOfSRegisters;
+}
+
 ManagedRegister CodeGeneratorARM::AllocateFreeRegister(Primitive::Type type,
                                                        bool* blocked_registers) const {
   switch (type) {
@@ -262,8 +270,10 @@
     }
 
     case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << type;
+    case Primitive::kPrimDouble: {
+      int reg = AllocateFreeRegisterInternal(
+          GetBlockedDRegisters(blocked_registers), kNumberOfDRegisters);
+      return ArmManagedRegister::FromDRegister(static_cast<DRegister>(reg));
+    }
 
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << type;
@@ -274,6 +284,7 @@
 
 void CodeGeneratorARM::SetupBlockedRegisters(bool* blocked_registers) const {
   bool* blocked_register_pairs = GetBlockedRegisterPairs(blocked_registers);
+  bool* blocked_fpu_registers = GetBlockedDRegisters(blocked_registers);
 
   // Don't allocate the dalvik style register pair passing.
   blocked_register_pairs[R1_R2] = true;
@@ -300,6 +311,15 @@
   blocked_registers[R8] = true;
   blocked_registers[R10] = true;
   blocked_registers[R11] = true;
+
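+  // D8-D15 are callee-saved VFP registers in the AAPCS.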
+  blocked_fpu_registers[D8] = true;
+  blocked_fpu_registers[D9] = true;
+  blocked_fpu_registers[D10] = true;
+  blocked_fpu_registers[D11] = true;
+  blocked_fpu_registers[D12] = true;
+  blocked_fpu_registers[D13] = true;
+  blocked_fpu_registers[D14] = true;
+  blocked_fpu_registers[D15] = true;
 }
 
 size_t CodeGeneratorARM::GetNumberOfRegisters() const {
@@ -348,16 +368,14 @@
 Location CodeGeneratorARM::GetStackLocation(HLoadLocal* load) const {
   switch (load->GetType()) {
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       return Location::DoubleStackSlot(GetStackSlot(load->GetLocal()));
       break;
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
-      return Location::StackSlot(GetStackSlot(load->GetLocal()));
-
     case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented type " << load->GetType();
+      return Location::StackSlot(GetStackSlot(load->GetLocal()));
 
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -378,6 +396,7 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimFloat:
     case Primitive::kPrimNot: {
       uint32_t index = gp_index_++;
       if (index < calling_convention.GetNumberOfRegisters()) {
@@ -387,7 +406,8 @@
       }
     }
 
-    case Primitive::kPrimLong: {
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble: {
       uint32_t index = gp_index_;
       gp_index_ += 2;
       if (index + 1 < calling_convention.GetNumberOfRegisters()) {
@@ -400,11 +420,6 @@
       }
     }
 
-    case Primitive::kPrimDouble:
-    case Primitive::kPrimFloat:
-      LOG(FATAL) << "Unimplemented parameter type " << type;
-      break;
-
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unexpected parameter type " << type;
       break;
@@ -419,14 +434,32 @@
   if (destination.IsRegister()) {
     if (source.IsRegister()) {
       __ Mov(destination.AsArm().AsCoreRegister(), source.AsArm().AsCoreRegister());
+    } else if (source.IsFpuRegister()) {
+      __ vmovrs(destination.AsArm().AsCoreRegister(),
+                source.AsArm().AsOverlappingDRegisterLow());
     } else {
       __ ldr(destination.AsArm().AsCoreRegister(), Address(SP, source.GetStackIndex()));
     }
+  } else if (destination.IsFpuRegister()) {
+    if (source.IsRegister()) {
+      __ vmovsr(destination.AsArm().AsOverlappingDRegisterLow(),
+                source.AsArm().AsCoreRegister());
+    } else if (source.IsFpuRegister()) {
+      __ vmovs(destination.AsArm().AsOverlappingDRegisterLow(),
+               source.AsArm().AsOverlappingDRegisterLow());
+    } else {
+      __ vldrs(destination.AsArm().AsOverlappingDRegisterLow(),
+               Address(SP, source.GetStackIndex()));
+    }
   } else {
     DCHECK(destination.IsStackSlot());
     if (source.IsRegister()) {
       __ str(source.AsArm().AsCoreRegister(), Address(SP, destination.GetStackIndex()));
+    } else if (source.IsFpuRegister()) {
+      __ vstrs(source.AsArm().AsOverlappingDRegisterLow(),
+               Address(SP, destination.GetStackIndex()));
     } else {
+      DCHECK(source.IsStackSlot());
       __ ldr(IP, Address(SP, source.GetStackIndex()));
       __ str(IP, Address(SP, destination.GetStackIndex()));
     }
@@ -441,6 +474,8 @@
     if (source.IsRegister()) {
       __ Mov(destination.AsArm().AsRegisterPairLow(), source.AsArm().AsRegisterPairLow());
       __ Mov(destination.AsArm().AsRegisterPairHigh(), source.AsArm().AsRegisterPairHigh());
+    } else if (source.IsFpuRegister()) {
+      LOG(FATAL) << "Unimplemented";
     } else if (source.IsQuickParameter()) {
       uint32_t argument_index = source.GetQuickParameterIndex();
       InvokeDexCallingConvention calling_convention;
@@ -458,6 +493,12 @@
                           SP, source.GetStackIndex());
       }
     }
+  } else if (destination.IsFpuRegister()) {
+    if (source.IsDoubleStackSlot()) {
+      __ vldrd(destination.AsArm().AsDRegister(), Address(SP, source.GetStackIndex()));
+    } else {
+      LOG(FATAL) << "Unimplemented";
+    }
   } else if (destination.IsQuickParameter()) {
     InvokeDexCallingConvention calling_convention;
     uint32_t argument_index = destination.GetQuickParameterIndex();
@@ -465,6 +506,8 @@
       __ Mov(calling_convention.GetRegisterAt(argument_index), source.AsArm().AsRegisterPairLow());
       __ str(source.AsArm().AsRegisterPairHigh(),
              Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1)));
+    } else if (source.IsFpuRegister()) {
+      LOG(FATAL) << "Unimplemented";
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ ldr(calling_convention.GetRegisterAt(argument_index), Address(SP, source.GetStackIndex()));
@@ -489,6 +532,8 @@
       __ ldr(R0,
              Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize()));
       __ str(R0, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
+    } else if (source.IsFpuRegister()) {
+      __ vstrd(source.AsArm().AsDRegister(), Address(SP, destination.GetStackIndex()));
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ ldr(IP, Address(SP, source.GetStackIndex()));
@@ -535,15 +580,17 @@
       case Primitive::kPrimShort:
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
+      case Primitive::kPrimFloat:
         Move32(location, Location::StackSlot(stack_slot));
         break;
 
       case Primitive::kPrimLong:
+      case Primitive::kPrimDouble:
         Move64(location, Location::DoubleStackSlot(stack_slot));
         break;
 
       default:
-        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+        LOG(FATAL) << "Unexpected type " << instruction->GetType();
     }
   } else {
     DCHECK((instruction->GetNext() == move_for) || instruction->GetNext()->IsTemporary());
@@ -554,15 +601,17 @@
       case Primitive::kPrimShort:
       case Primitive::kPrimNot:
       case Primitive::kPrimInt:
+      case Primitive::kPrimFloat:
         Move32(location, locations->Out());
         break;
 
       case Primitive::kPrimLong:
+      case Primitive::kPrimDouble:
         Move64(location, locations->Out());
         break;
 
       default:
-        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+        LOG(FATAL) << "Unexpected type " << instruction->GetType();
     }
   }
 }
@@ -608,18 +657,14 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(if_instr, LocationSummary::kNoCall);
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
-  HCondition* condition = cond->AsCondition();
-  if (condition->NeedsMaterialization()) {
+  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
     locations->SetInAt(0, Location::RequiresRegister(), Location::kDiesAtEntry);
   }
 }
 
 void InstructionCodeGeneratorARM::VisitIf(HIf* if_instr) {
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
-  HCondition* condition = cond->AsCondition();
-  if (condition->NeedsMaterialization()) {
+  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
     // Condition has been materialized, compare the output to 0
     DCHECK(if_instr->GetLocations()->InAt(0).IsRegister());
     __ cmp(if_instr->GetLocations()->InAt(0).AsArm().AsCoreRegister(),
@@ -628,7 +673,7 @@
   } else {
     // Condition has not been materialized, use its inputs as the comparison and its
     // condition as the branch condition.
-    LocationSummary* locations = condition->GetLocations();
+    LocationSummary* locations = cond->GetLocations();
     if (locations->InAt(1).IsRegister()) {
       __ cmp(locations->InAt(0).AsArm().AsCoreRegister(),
              ShifterOperand(locations->InAt(1).AsArm().AsCoreRegister()));
@@ -645,7 +690,7 @@
       }
     }
     __ b(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()),
-         ARMCondition(condition->GetCondition()));
+         ARMCondition(cond->AsCondition()->GetCondition()));
   }
 
   if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfFalseSuccessor())) {
@@ -764,15 +809,17 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
+    case Primitive::kPrimFloat:
       locations->SetInAt(1, Location::StackSlot(codegen_->GetStackSlot(store->GetLocal())));
       break;
 
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       locations->SetInAt(1, Location::DoubleStackSlot(codegen_->GetStackSlot(store->GetLocal())));
       break;
 
     default:
-      LOG(FATAL) << "Unimplemented local type " << store->InputAt(1)->GetType();
+      LOG(FATAL) << "Unexpected local type " << store->InputAt(1)->GetType();
   }
 }
 
@@ -816,10 +863,12 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
+    case Primitive::kPrimFloat:
       locations->SetInAt(0, ArmCoreLocation(R0));
       break;
 
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       locations->SetInAt(
           0, Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
       break;
@@ -838,10 +887,12 @@
       case Primitive::kPrimShort:
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
+      case Primitive::kPrimFloat:
         DCHECK_EQ(ret->GetLocations()->InAt(0).AsArm().AsCoreRegister(), R0);
         break;
 
       case Primitive::kPrimLong:
+      case Primitive::kPrimDouble:
         DCHECK_EQ(ret->GetLocations()->InAt(0).AsArm().AsRegisterPair(), R0_R1);
         break;
 
@@ -911,20 +962,17 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
+    case Primitive::kPrimFloat:
       locations->SetOut(ArmCoreLocation(R0));
       break;
 
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       locations->SetOut(Location::RegisterLocation(ArmManagedRegister::FromRegisterPair(R0_R1)));
       break;
 
     case Primitive::kPrimVoid:
       break;
-
-    case Primitive::kPrimDouble:
-    case Primitive::kPrimFloat:
-      LOG(FATAL) << "Unimplemented return type " << invoke->GetType();
-      break;
   }
 }
 
@@ -967,15 +1015,16 @@
       break;
     }
 
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
       break;
+    }
 
     default:
-      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
   }
 }
 
@@ -1003,15 +1052,20 @@
              ShifterOperand(locations->InAt(1).AsArm().AsRegisterPairHigh()));
       break;
 
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+    case Primitive::kPrimFloat:
+      __ vadds(locations->Out().AsArm().AsOverlappingDRegisterLow(),
+               locations->InAt(0).AsArm().AsOverlappingDRegisterLow(),
+               locations->InAt(1).AsArm().AsOverlappingDRegisterLow());
+      break;
+
+    case Primitive::kPrimDouble:
+      __ vaddd(locations->Out().AsArm().AsDRegister(),
+               locations->InAt(0).AsArm().AsDRegister(),
+               locations->InAt(1).AsArm().AsDRegister());
       break;
 
     default:
-      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
   }
 }
 
@@ -1311,8 +1365,9 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
-  // TODO: Have a normalization phase that makes this instruction never used.
-  locations->SetOut(Location::SameAsFirstInput());
+  if (instruction->HasUses()) {
+    locations->SetOut(Location::SameAsFirstInput());
+  }
 }
 
 void InstructionCodeGeneratorARM::VisitNullCheck(HNullCheck* instruction) {
@@ -1321,12 +1376,15 @@
 
   LocationSummary* locations = instruction->GetLocations();
   Location obj = locations->InAt(0);
-  DCHECK(obj.Equals(locations->Out()));
 
   if (obj.IsRegister()) {
     __ cmp(obj.AsArm().AsCoreRegister(), ShifterOperand(0));
+    __ b(slow_path->GetEntryLabel(), EQ);
+  } else {
+    DCHECK(obj.IsConstant()) << obj;
+    DCHECK_EQ(obj.GetConstant()->AsIntConstant()->GetValue(), 0);
+    __ b(slow_path->GetEntryLabel());
   }
-  __ b(slow_path->GetEntryLabel(), EQ);
 }
 
 void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) {
@@ -1550,8 +1608,9 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // TODO: Have a normalization phase that makes this instruction never used.
-  locations->SetOut(Location::SameAsFirstInput());
+  if (instruction->HasUses()) {
+    locations->SetOut(Location::SameAsFirstInput());
+  }
 }
 
 void InstructionCodeGeneratorARM::VisitBoundsCheck(HBoundsCheck* instruction) {
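
Note on the ARM changes above: no FPU parameter registers are wired up yet (kParameterFpuRegisters is empty in the header below), so float locals and returns travel through core registers (R0, R0_R1) as raw 32-bit patterns. A minimal standalone sketch of that round trip, with illustrative helper names that are not ART's:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // A float "returned in R0" is just its 32-bit pattern in a core register.
    static uint32_t AsRawBits(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));  // bit-level copy, no conversion
      return bits;
    }

    static float FromRawBits(uint32_t bits) {
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }

    int main() {
      float pi = 3.14159f;
      uint32_t in_r0 = AsRawBits(pi);            // what a callee would leave in R0
      std::printf("%f\n", FromRawBits(in_r0));   // prints 3.141590
    }
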
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 8c86b7a..b5de8ed 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -32,11 +32,16 @@
 static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 };
 static constexpr RegisterPair kParameterCorePairRegisters[] = { R1_R2, R2_R3 };
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+static constexpr DRegister kParameterFpuRegisters[] = { };
+static constexpr size_t kParameterFpuRegistersLength = 0;
 
-class InvokeDexCallingConvention : public CallingConvention<Register> {
+class InvokeDexCallingConvention : public CallingConvention<Register, DRegister> {
  public:
   InvokeDexCallingConvention()
-      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
+      : CallingConvention(kParameterCoreRegisters,
+                          kParameterCoreRegistersLength,
+                          kParameterFpuRegisters,
+                          kParameterFpuRegistersLength) {}
 
   RegisterPair GetRegisterPairAt(size_t argument_index) {
     DCHECK_LT(argument_index + 1, GetNumberOfRegisters());
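
The header change above gives CallingConvention a second register class. A compilable sketch of the assumed shape (simplified; not ART's actual template):

    #include <cstddef>
    #include <cstdio>

    // Simplified sketch of a calling convention over two register classes.
    template <typename CoreType, typename FpuType>
    class CallingConventionSketch {
     public:
      CallingConventionSketch(const CoreType* cores, std::size_t num_cores,
                              const FpuType* fpus, std::size_t num_fpus)
          : cores_(cores), num_cores_(num_cores), fpus_(fpus), num_fpus_(num_fpus) {}

      std::size_t GetNumberOfRegisters() const { return num_cores_; }
      std::size_t GetNumberOfFpuRegisters() const { return num_fpus_; }

     private:
      const CoreType* cores_;
      std::size_t num_cores_;
      const FpuType* fpus_;   // may be empty, as on ARM here
      std::size_t num_fpus_;
    };

    enum SketchCore { kR1, kR2, kR3 };
    enum SketchFpu { kD0 };
    static constexpr SketchCore kSketchCores[] = { kR1, kR2, kR3 };

    int main() {
      CallingConventionSketch<SketchCore, SketchFpu> cc(kSketchCores, 3, nullptr, 0);
      std::printf("%zu core, %zu fpu registers\n",
                  cc.GetNumberOfRegisters(), cc.GetNumberOfFpuRegisters());
    }
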
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 6791003..328fc93 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -47,12 +47,16 @@
 static constexpr Register kRuntimeParameterCoreRegisters[] = { EAX, ECX, EDX };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
+static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
+static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
 
-class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+class InvokeRuntimeCallingConvention : public CallingConvention<Register, XmmRegister> {
  public:
   InvokeRuntimeCallingConvention()
       : CallingConvention(kRuntimeParameterCoreRegisters,
-                          kRuntimeParameterCoreRegistersLength) {}
+                          kRuntimeParameterCoreRegistersLength,
+                          kRuntimeParameterFpuRegisters,
+                          kRuntimeParameterFpuRegistersLength) {}
 
  private:
   DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
@@ -193,6 +197,10 @@
   return blocked_registers + kNumberOfAllocIds;
 }
 
+static bool* GetBlockedXmmRegisters(bool* blocked_registers) {
+  return blocked_registers + kNumberOfCpuRegisters;
+}
+
 ManagedRegister CodeGeneratorX86::AllocateFreeRegister(Primitive::Type type,
                                                        bool* blocked_registers) const {
   switch (type) {
@@ -238,8 +246,11 @@
     }
 
     case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << type;
+    case Primitive::kPrimDouble: {
+      XmmRegister reg = static_cast<XmmRegister>(AllocateFreeRegisterInternal(
+          GetBlockedXmmRegisters(blocked_registers), kNumberOfXmmRegisters));
+      return X86ManagedRegister::FromXmmRegister(reg);
+    }
 
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << type;
@@ -316,16 +327,14 @@
 Location CodeGeneratorX86::GetStackLocation(HLoadLocal* load) const {
   switch (load->GetType()) {
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       return Location::DoubleStackSlot(GetStackSlot(load->GetLocal()));
       break;
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
-      return Location::StackSlot(GetStackSlot(load->GetLocal()));
-
     case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented type " << load->GetType();
+      return Location::StackSlot(GetStackSlot(load->GetLocal()));
 
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -346,6 +355,7 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimFloat:
     case Primitive::kPrimNot: {
       uint32_t index = gp_index_++;
       if (index < calling_convention.GetNumberOfRegisters()) {
@@ -355,7 +365,8 @@
       }
     }
 
-    case Primitive::kPrimLong: {
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble: {
       uint32_t index = gp_index_;
       gp_index_ += 2;
       if (index + 1 < calling_convention.GetNumberOfRegisters()) {
@@ -368,11 +379,6 @@
       }
     }
 
-    case Primitive::kPrimDouble:
-    case Primitive::kPrimFloat:
-      LOG(FATAL) << "Unimplemented parameter type " << type;
-      break;
-
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unexpected parameter type " << type;
       break;
@@ -387,13 +393,27 @@
   if (destination.IsRegister()) {
     if (source.IsRegister()) {
       __ movl(destination.AsX86().AsCpuRegister(), source.AsX86().AsCpuRegister());
+    } else if (source.IsFpuRegister()) {
+      __ movd(destination.AsX86().AsCpuRegister(), source.AsX86().AsXmmRegister());
     } else {
       DCHECK(source.IsStackSlot());
       __ movl(destination.AsX86().AsCpuRegister(), Address(ESP, source.GetStackIndex()));
     }
+  } else if (destination.IsFpuRegister()) {
+    if (source.IsRegister()) {
+      __ movd(destination.AsX86().AsXmmRegister(), source.AsX86().AsCpuRegister());
+    } else if (source.IsFpuRegister()) {
+      __ movaps(destination.AsX86().AsXmmRegister(), source.AsX86().AsXmmRegister());
+    } else {
+      DCHECK(source.IsStackSlot());
+      __ movss(destination.AsX86().AsXmmRegister(), Address(ESP, source.GetStackIndex()));
+    }
   } else {
+    DCHECK(destination.IsStackSlot());
     if (source.IsRegister()) {
       __ movl(Address(ESP, destination.GetStackIndex()), source.AsX86().AsCpuRegister());
+    } else if (source.IsFpuRegister()) {
+      __ movss(Address(ESP, destination.GetStackIndex()), source.AsX86().AsXmmRegister());
     } else {
       DCHECK(source.IsStackSlot());
       __ pushl(Address(ESP, source.GetStackIndex()));
@@ -410,6 +430,8 @@
     if (source.IsRegister()) {
       __ movl(destination.AsX86().AsRegisterPairLow(), source.AsX86().AsRegisterPairLow());
       __ movl(destination.AsX86().AsRegisterPairHigh(), source.AsX86().AsRegisterPairHigh());
+    } else if (source.IsFpuRegister()) {
+      LOG(FATAL) << "Unimplemented";
     } else if (source.IsQuickParameter()) {
       uint32_t argument_index = source.GetQuickParameterIndex();
       InvokeDexCallingConvention calling_convention;
@@ -430,6 +452,8 @@
       __ movl(calling_convention.GetRegisterAt(argument_index), source.AsX86().AsRegisterPairLow());
       __ movl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1)),
               source.AsX86().AsRegisterPairHigh());
+    } else if (source.IsFpuRegister()) {
+      LOG(FATAL) << "Unimplemented";
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movl(calling_convention.GetRegisterAt(argument_index),
@@ -437,6 +461,12 @@
       __ pushl(Address(ESP, source.GetHighStackIndex(kX86WordSize)));
       __ popl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1)));
     }
+  } else if (destination.IsFpuRegister()) {
+    if (source.IsDoubleStackSlot()) {
+      __ movsd(destination.AsX86().AsXmmRegister(), Address(ESP, source.GetStackIndex()));
+    } else {
+      LOG(FATAL) << "Unimplemented";
+    }
   } else {
     DCHECK(destination.IsDoubleStackSlot());
     if (source.IsRegister()) {
@@ -448,9 +478,10 @@
       uint32_t argument_index = source.GetQuickParameterIndex();
       __ movl(Address(ESP, destination.GetStackIndex()),
               calling_convention.GetRegisterAt(argument_index));
-      __ pushl(Address(ESP,
-          calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize()));
-      __ popl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)));
+      DCHECK_EQ(calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize(),
+                static_cast<size_t>(destination.GetHighStackIndex(kX86WordSize)));
+    } else if (source.IsFpuRegister()) {
+      __ movsd(Address(ESP, destination.GetStackIndex()), source.AsX86().AsXmmRegister());
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ pushl(Address(ESP, source.GetStackIndex()));
@@ -479,6 +510,7 @@
       __ movl(Address(ESP, location.GetHighStackIndex(kX86WordSize)), Immediate(High32Bits(value)));
     }
   } else if (instruction->AsLoadLocal() != nullptr) {
+    int slot = GetStackSlot(instruction->AsLoadLocal()->GetLocal());
     switch (instruction->GetType()) {
       case Primitive::kPrimBoolean:
       case Primitive::kPrimByte:
@@ -486,12 +518,13 @@
       case Primitive::kPrimShort:
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
-        Move32(location, Location::StackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+      case Primitive::kPrimFloat:
+        Move32(location, Location::StackSlot(slot));
         break;
 
       case Primitive::kPrimLong:
-        Move64(location, Location::DoubleStackSlot(
-            GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+      case Primitive::kPrimDouble:
+        Move64(location, Location::DoubleStackSlot(slot));
         break;
 
       default:
@@ -506,15 +539,17 @@
       case Primitive::kPrimShort:
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
+      case Primitive::kPrimFloat:
         Move32(location, instruction->GetLocations()->Out());
         break;
 
       case Primitive::kPrimLong:
+      case Primitive::kPrimDouble:
         Move64(location, instruction->GetLocations()->Out());
         break;
 
       default:
-        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+        LOG(FATAL) << "Unexpected type " << instruction->GetType();
     }
   }
 }
@@ -560,22 +595,18 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(if_instr, LocationSummary::kNoCall);
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
-  HCondition* condition = cond->AsCondition();
-  if (condition->NeedsMaterialization()) {
+  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
     locations->SetInAt(0, Location::Any(), Location::kDiesAtEntry);
   }
 }
 
 void InstructionCodeGeneratorX86::VisitIf(HIf* if_instr) {
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
-  HCondition* condition = cond->AsCondition();
-  if (condition->NeedsMaterialization()) {
+  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
     // Moves do not affect the eflags register, so if the condition is evaluated
     // just before the if, we don't need to evaluate it again.
-    if (!condition->IsBeforeWhenDisregardMoves(if_instr)) {
-      // Materialized condition, compare against 0
+    if (!cond->IsCondition() || !cond->AsCondition()->IsBeforeWhenDisregardMoves(if_instr)) {
+      // Materialized condition, compare against 0.
       Location lhs = if_instr->GetLocations()->InAt(0);
       if (lhs.IsRegister()) {
         __ cmpl(lhs.AsX86().AsCpuRegister(), Immediate(0));
@@ -585,8 +616,8 @@
     }
     __ j(kNotEqual,  codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
   } else {
-    Location lhs = condition->GetLocations()->InAt(0);
-    Location rhs = condition->GetLocations()->InAt(1);
+    Location lhs = cond->GetLocations()->InAt(0);
+    Location rhs = cond->GetLocations()->InAt(1);
     // LHS is guaranteed to be in a register (see LocationsBuilderX86::VisitCondition).
     if (rhs.IsRegister()) {
       __ cmpl(lhs.AsX86().AsCpuRegister(), rhs.AsX86().AsCpuRegister());
@@ -597,7 +628,7 @@
     } else {
       __ cmpl(lhs.AsX86().AsCpuRegister(), Address(ESP, rhs.GetStackIndex()));
     }
-    __ j(X86Condition(condition->GetCondition()),
+    __ j(X86Condition(cond->AsCondition()->GetCondition()),
          codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
   }
   if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfFalseSuccessor())) {
@@ -631,15 +662,17 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
+    case Primitive::kPrimFloat:
       locations->SetInAt(1, Location::StackSlot(codegen_->GetStackSlot(store->GetLocal())));
       break;
 
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       locations->SetInAt(1, Location::DoubleStackSlot(codegen_->GetStackSlot(store->GetLocal())));
       break;
 
     default:
-      LOG(FATAL) << "Unimplemented local type " << store->InputAt(1)->GetType();
+      LOG(FATAL) << "Unknown local type " << store->InputAt(1)->GetType();
   }
   store->SetLocations(locations);
 }
@@ -772,8 +805,14 @@
           0, Location::RegisterLocation(X86ManagedRegister::FromRegisterPair(EAX_EDX)));
       break;
 
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(
+          0, Location::FpuRegisterLocation(X86ManagedRegister::FromXmmRegister(XMM0)));
+      break;
+
     default:
-      LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+      LOG(FATAL) << "Unknown return type " << ret->InputAt(0)->GetType();
   }
 }
 
@@ -793,8 +832,13 @@
         DCHECK_EQ(ret->GetLocations()->InAt(0).AsX86().AsRegisterPair(), EAX_EDX);
         break;
 
+      case Primitive::kPrimFloat:
+      case Primitive::kPrimDouble:
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsX86().AsXmmRegister(), XMM0);
+        break;
+
       default:
-        LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+        LOG(FATAL) << "Unknown return type " << ret->InputAt(0)->GetType();
     }
   }
   codegen_->GenerateFrameExit();
@@ -865,7 +909,7 @@
 
     case Primitive::kPrimDouble:
     case Primitive::kPrimFloat:
-      LOG(FATAL) << "Unimplemented return type " << invoke->GetType();
+      locations->SetOut(Location::FpuRegisterLocation(X86ManagedRegister::FromXmmRegister(XMM0)));
       break;
   }
 
@@ -907,64 +951,74 @@
       break;
     }
 
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::SameAsFirstInput());
       break;
+    }
 
     default:
-      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+      break;
   }
 }
 
 void InstructionCodeGeneratorX86::VisitAdd(HAdd* add) {
   LocationSummary* locations = add->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+
   switch (add->GetResultType()) {
     case Primitive::kPrimInt: {
-      DCHECK_EQ(locations->InAt(0).AsX86().AsCpuRegister(),
-                locations->Out().AsX86().AsCpuRegister());
-      if (locations->InAt(1).IsRegister()) {
-        __ addl(locations->InAt(0).AsX86().AsCpuRegister(),
-                locations->InAt(1).AsX86().AsCpuRegister());
-      } else if (locations->InAt(1).IsConstant()) {
-        HConstant* instruction = locations->InAt(1).GetConstant();
+      DCHECK_EQ(first.AsX86().AsCpuRegister(), locations->Out().AsX86().AsCpuRegister());
+      if (second.IsRegister()) {
+        __ addl(first.AsX86().AsCpuRegister(), second.AsX86().AsCpuRegister());
+      } else if (second.IsConstant()) {
+        HConstant* instruction = second.GetConstant();
         Immediate imm(instruction->AsIntConstant()->GetValue());
-        __ addl(locations->InAt(0).AsX86().AsCpuRegister(), imm);
+        __ addl(first.AsX86().AsCpuRegister(), imm);
       } else {
-        __ addl(locations->InAt(0).AsX86().AsCpuRegister(),
-                Address(ESP, locations->InAt(1).GetStackIndex()));
+        __ addl(first.AsX86().AsCpuRegister(), Address(ESP, second.GetStackIndex()));
       }
       break;
     }
 
     case Primitive::kPrimLong: {
-      DCHECK_EQ(locations->InAt(0).AsX86().AsRegisterPair(),
+      DCHECK_EQ(first.AsX86().AsRegisterPair(),
                 locations->Out().AsX86().AsRegisterPair());
-      if (locations->InAt(1).IsRegister()) {
-        __ addl(locations->InAt(0).AsX86().AsRegisterPairLow(),
-                locations->InAt(1).AsX86().AsRegisterPairLow());
-        __ adcl(locations->InAt(0).AsX86().AsRegisterPairHigh(),
-                locations->InAt(1).AsX86().AsRegisterPairHigh());
+      if (second.IsRegister()) {
+        __ addl(first.AsX86().AsRegisterPairLow(), second.AsX86().AsRegisterPairLow());
+        __ adcl(first.AsX86().AsRegisterPairHigh(), second.AsX86().AsRegisterPairHigh());
       } else {
-        __ addl(locations->InAt(0).AsX86().AsRegisterPairLow(),
-                Address(ESP, locations->InAt(1).GetStackIndex()));
-        __ adcl(locations->InAt(0).AsX86().AsRegisterPairHigh(),
-                Address(ESP, locations->InAt(1).GetHighStackIndex(kX86WordSize)));
+        __ addl(first.AsX86().AsRegisterPairLow(), Address(ESP, second.GetStackIndex()));
+        __ adcl(first.AsX86().AsRegisterPairHigh(),
+                Address(ESP, second.GetHighStackIndex(kX86WordSize)));
       }
       break;
     }
 
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+    case Primitive::kPrimFloat: {
+      if (second.IsFpuRegister()) {
+        __ addss(first.AsX86().AsXmmRegister(), second.AsX86().AsXmmRegister());
+      } else {
+        __ addss(first.AsX86().AsXmmRegister(), Address(ESP, second.GetStackIndex()));
+      }
       break;
+    }
+
+    case Primitive::kPrimDouble: {
+      if (second.IsFpuRegister()) {
+        __ addsd(first.AsX86().AsXmmRegister(), second.AsX86().AsXmmRegister());
+      } else {
+        __ addsd(first.AsX86().AsXmmRegister(), Address(ESP, second.GetStackIndex()));
+      }
+      break;
+    }
 
     default:
-      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
   }
 }
 
@@ -1160,8 +1214,12 @@
   locations->SetInAt(0, Location::RequiresRegister());
   Primitive::Type field_type = instruction->GetFieldType();
   bool is_object_type = field_type == Primitive::kPrimNot;
-  bool dies_at_entry = !is_object_type;
-  if (field_type == Primitive::kPrimBoolean || field_type == Primitive::kPrimByte) {
+  bool is_byte_type = (field_type == Primitive::kPrimBoolean)
+      || (field_type == Primitive::kPrimByte);
+  // The register allocator does not support multiple
+  // inputs that die at entry with one in a specific register.
+  bool dies_at_entry = !is_object_type && !is_byte_type;
+  if (is_byte_type) {
     // Ensure the value is in a byte register.
     locations->SetInAt(1, X86CpuLocation(EAX), dies_at_entry);
   } else {
@@ -1302,8 +1360,9 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::Any());
-  // TODO: Have a normalization phase that makes this instruction never used.
-  locations->SetOut(Location::SameAsFirstInput());
+  if (instruction->HasUses()) {
+    locations->SetOut(Location::SameAsFirstInput());
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitNullCheck(HNullCheck* instruction) {
@@ -1312,13 +1371,16 @@
 
   LocationSummary* locations = instruction->GetLocations();
   Location obj = locations->InAt(0);
-  DCHECK(obj.Equals(locations->Out()));
 
   if (obj.IsRegister()) {
     __ cmpl(obj.AsX86().AsCpuRegister(), Immediate(0));
-  } else {
-    DCHECK(locations->InAt(0).IsStackSlot());
+  } else if (obj.IsStackSlot()) {
     __ cmpl(Address(ESP, obj.GetStackIndex()), Immediate(0));
+  } else {
+    DCHECK(obj.IsConstant()) << obj;
+    DCHECK_EQ(obj.GetConstant()->AsIntConstant()->GetValue(), 0);
+    __ jmp(slow_path->GetEntryLabel());
+    return;
   }
   __ j(kEqual, slow_path->GetEntryLabel());
 }
@@ -1436,16 +1498,21 @@
     locations->SetInAt(1, X86CpuLocation(calling_convention.GetRegisterAt(1)));
     locations->SetInAt(2, X86CpuLocation(calling_convention.GetRegisterAt(2)));
   } else {
+    bool is_byte_type = (value_type == Primitive::kPrimBoolean)
+        || (value_type == Primitive::kPrimByte);
     // We need the inputs to be different from the output in the case of a long operation.
-    bool dies_at_entry = value_type != Primitive::kPrimLong;
+    // In the case of a byte operation, the register allocator does not support
+    // multiple inputs that die at entry with one in a specific register.
+    bool dies_at_entry = value_type != Primitive::kPrimLong && !is_byte_type;
     locations->SetInAt(0, Location::RequiresRegister(), dies_at_entry);
     locations->SetInAt(
         1, Location::RegisterOrConstant(instruction->InputAt(1)), dies_at_entry);
-    if (value_type == Primitive::kPrimBoolean || value_type == Primitive::kPrimByte) {
+    if (is_byte_type) {
       // Ensure the value is in a byte register.
-      locations->SetInAt(2, X86CpuLocation(EAX), dies_at_entry);
+      locations->SetInAt(2, Location::ByteRegisterOrConstant(
+          X86ManagedRegister::FromCpuRegister(EAX), instruction->InputAt(2)), dies_at_entry);
     } else {
-      locations->SetInAt(2, Location::RequiresRegister(), dies_at_entry);
+      locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2)), dies_at_entry);
     }
   }
 }
@@ -1454,18 +1521,29 @@
   LocationSummary* locations = instruction->GetLocations();
   Register obj = locations->InAt(0).AsX86().AsCpuRegister();
   Location index = locations->InAt(1);
+  Location value = locations->InAt(2);
   Primitive::Type value_type = instruction->GetComponentType();
 
   switch (value_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value();
-      ByteRegister value = locations->InAt(2).AsX86().AsByteRegister();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ movb(Address(obj, offset), value);
+        if (value.IsRegister()) {
+          __ movb(Address(obj, offset), value.AsX86().AsByteRegister());
+        } else {
+          __ movb(Address(obj, offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       } else {
-        __ movb(Address(obj, index.AsX86().AsCpuRegister(), TIMES_1, data_offset), value);
+        if (value.IsRegister()) {
+          __ movb(Address(obj, index.AsX86().AsCpuRegister(), TIMES_1, data_offset),
+                  value.AsX86().AsByteRegister());
+        } else {
+          __ movb(Address(obj, index.AsX86().AsCpuRegister(), TIMES_1, data_offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       }
       break;
     }
@@ -1473,24 +1551,43 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value();
-      Register value = locations->InAt(2).AsX86().AsCpuRegister();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ movw(Address(obj, offset), value);
+        if (value.IsRegister()) {
+          __ movw(Address(obj, offset), value.AsX86().AsCpuRegister());
+        } else {
+          __ movw(Address(obj, offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       } else {
-        __ movw(Address(obj, index.AsX86().AsCpuRegister(), TIMES_2, data_offset), value);
+        if (value.IsRegister()) {
+          __ movw(Address(obj, index.AsX86().AsCpuRegister(), TIMES_2, data_offset),
+                  value.AsX86().AsCpuRegister());
+        } else {
+          __ movw(Address(obj, index.AsX86().AsCpuRegister(), TIMES_2, data_offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       }
       break;
     }
 
     case Primitive::kPrimInt: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-      Register value = locations->InAt(2).AsX86().AsCpuRegister();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ movl(Address(obj, offset), value);
+        if (value.IsRegister()) {
+          __ movl(Address(obj, offset), value.AsX86().AsCpuRegister());
+        } else {
+          __ movl(Address(obj, offset), Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       } else {
-        __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_4, data_offset), value);
+        if (value.IsRegister()) {
+          __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_4, data_offset),
+                  value.AsX86().AsCpuRegister());
+        } else {
+          __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_4, data_offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       }
       break;
     }
@@ -1504,16 +1601,29 @@
 
     case Primitive::kPrimLong: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value();
-      X86ManagedRegister value = locations->InAt(2).AsX86();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ movl(Address(obj, offset), value.AsRegisterPairLow());
-        __ movl(Address(obj, offset + kX86WordSize), value.AsRegisterPairHigh());
+        if (value.IsRegister()) {
+          __ movl(Address(obj, offset), value.AsX86().AsRegisterPairLow());
+          __ movl(Address(obj, offset + kX86WordSize), value.AsX86().AsRegisterPairHigh());
+        } else {
+          int64_t val = value.GetConstant()->AsLongConstant()->GetValue();
+          __ movl(Address(obj, offset), Immediate(Low32Bits(val)));
+          __ movl(Address(obj, offset + kX86WordSize), Immediate(High32Bits(val)));
+        }
       } else {
-        __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_8, data_offset),
-                value.AsRegisterPairLow());
-        __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_8, data_offset + kX86WordSize),
-                value.AsRegisterPairHigh());
+        if (value.IsRegister()) {
+          __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_8, data_offset),
+                  value.AsX86().AsRegisterPairLow());
+          __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_8, data_offset + kX86WordSize),
+                  value.AsX86().AsRegisterPairHigh());
+        } else {
+          int64_t val = value.GetConstant()->AsLongConstant()->GetValue();
+          __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_8, data_offset),
+                  Immediate(Low32Bits(val)));
+          __ movl(Address(obj, index.AsX86().AsCpuRegister(), TIMES_8, data_offset + kX86WordSize),
+                  Immediate(High32Bits(val)));
+        }
       }
       break;
     }
@@ -1547,8 +1657,9 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // TODO: Have a normalization phase that makes this instruction never used.
-  locations->SetOut(Location::SameAsFirstInput());
+  if (instruction->HasUses()) {
+    locations->SetOut(Location::SameAsFirstInput());
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitBoundsCheck(HBoundsCheck* instruction) {
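
In the VisitArraySet long-constant path above, a 64-bit immediate is split into two 32-bit movl stores, since 32-bit x86 has no 64-bit immediate store. A self-contained sketch of that split, with Low32Bits/High32Bits re-declared here as stand-ins for ART's utilities:

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for ART's Low32Bits/High32Bits helpers.
    static uint32_t Low32Bits(int64_t value) { return static_cast<uint32_t>(value); }
    static uint32_t High32Bits(int64_t value) {
      return static_cast<uint32_t>(static_cast<uint64_t>(value) >> 32);
    }

    int main() {
      int64_t val = INT64_C(0x123456789ABCDEF0);
      // The codegen emits one movl per half of the constant.
      std::printf("movl low  -> 0x%08x\n", Low32Bits(val));   // 0x9abcdef0
      std::printf("movl high -> 0x%08x\n", High32Bits(val));  // 0x12345678
    }
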
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 23145bf..3e2ca90 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -32,11 +32,16 @@
 static constexpr Register kParameterCoreRegisters[] = { ECX, EDX, EBX };
 static constexpr RegisterPair kParameterCorePairRegisters[] = { ECX_EDX, EDX_EBX };
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+static constexpr XmmRegister kParameterFpuRegisters[] = { };
+static constexpr size_t kParameterFpuRegistersLength = 0;
 
-class InvokeDexCallingConvention : public CallingConvention<Register> {
+class InvokeDexCallingConvention : public CallingConvention<Register, XmmRegister> {
  public:
-  InvokeDexCallingConvention()
-      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
+  InvokeDexCallingConvention() : CallingConvention(
+      kParameterCoreRegisters,
+      kParameterCoreRegistersLength,
+      kParameterFpuRegisters,
+      kParameterFpuRegistersLength) {}
 
   RegisterPair GetRegisterPairAt(size_t argument_index) {
     DCHECK_LT(argument_index + 1, GetNumberOfRegisters());
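
The x86 code generator above (and the x86-64 one below) turns a null check whose input is the IntConstant 0 into an unconditional jump to the slow path, since the reference is known to be null. A sketch of that three-way decision in isolation, with stand-in types and assembly strings that are illustrative only:

    #include <cstdio>

    // Stand-in location kinds for the three cases the null check handles.
    enum class Kind { kRegister, kStackSlot, kConstantZero };

    // Mirrors the shape of VisitNullCheck: registers and stack slots are
    // compared against 0; a literal null skips the compare entirely.
    static const char* EmitNullCheck(Kind obj) {
      switch (obj) {
        case Kind::kRegister:     return "cmpl $0, %reg      ; je slow_path";
        case Kind::kStackSlot:    return "cmpl $0, idx(%esp) ; je slow_path";
        case Kind::kConstantZero: return "jmp slow_path  // always null";
      }
      return "";
    }

    int main() {
      std::printf("%s\n", EmitNullCheck(Kind::kConstantZero));
    }
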
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index e0e0c79..5d04ca6 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -51,12 +51,16 @@
 static constexpr Register kRuntimeParameterCoreRegisters[] = { RDI, RSI, RDX };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
+static constexpr FloatRegister kRuntimeParameterFpuRegisters[] = { };
+static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
 
-class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+class InvokeRuntimeCallingConvention : public CallingConvention<Register, FloatRegister> {
  public:
   InvokeRuntimeCallingConvention()
       : CallingConvention(kRuntimeParameterCoreRegisters,
-                          kRuntimeParameterCoreRegistersLength) {}
+                          kRuntimeParameterCoreRegistersLength,
+                          kRuntimeParameterFpuRegisters,
+                          kRuntimeParameterFpuRegistersLength) {}
 
  private:
   DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
@@ -219,8 +223,11 @@
     }
 
     case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << type;
+    case Primitive::kPrimDouble: {
+      size_t reg = AllocateFreeRegisterInternal(
+          blocked_registers + kNumberOfCpuRegisters, kNumberOfFloatRegisters);
+      return X86_64ManagedRegister::FromXmmRegister(static_cast<FloatRegister>(reg));
+    }
 
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << type;
@@ -243,6 +250,12 @@
   blocked_registers[R13] = true;
   blocked_registers[R14] = true;
   blocked_registers[R15] = true;
+
+  bool* blocked_xmm_registers = blocked_registers + kNumberOfCpuRegisters;
+  blocked_xmm_registers[XMM12] = true;
+  blocked_xmm_registers[XMM13] = true;
+  blocked_xmm_registers[XMM14] = true;
+  blocked_xmm_registers[XMM15] = true;
 }
 
 void CodeGeneratorX86_64::GenerateFrameEntry() {
@@ -291,16 +304,14 @@
 Location CodeGeneratorX86_64::GetStackLocation(HLoadLocal* load) const {
   switch (load->GetType()) {
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       return Location::DoubleStackSlot(GetStackSlot(load->GetLocal()));
       break;
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
-      return Location::StackSlot(GetStackSlot(load->GetLocal()));
-
     case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented type " << load->GetType();
+      return Location::StackSlot(GetStackSlot(load->GetLocal()));
 
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -321,15 +332,36 @@
   if (destination.IsRegister()) {
     if (source.IsRegister()) {
       __ movq(destination.AsX86_64().AsCpuRegister(), source.AsX86_64().AsCpuRegister());
+    } else if (source.IsFpuRegister()) {
+      __ movd(destination.AsX86_64().AsCpuRegister(), source.AsX86_64().AsXmmRegister());
     } else if (source.IsStackSlot()) {
-      __ movl(destination.AsX86_64().AsCpuRegister(), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movl(destination.AsX86_64().AsCpuRegister(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ movq(destination.AsX86_64().AsCpuRegister(), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movq(destination.AsX86_64().AsCpuRegister(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
+    }
+  } else if (destination.IsFpuRegister()) {
+    if (source.IsRegister()) {
+      __ movd(destination.AsX86_64().AsXmmRegister(), source.AsX86_64().AsCpuRegister());
+    } else if (source.IsFpuRegister()) {
+      __ movaps(destination.AsX86_64().AsXmmRegister(), source.AsX86_64().AsXmmRegister());
+    } else if (source.IsStackSlot()) {
+      __ movss(destination.AsX86_64().AsXmmRegister(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ movsd(destination.AsX86_64().AsXmmRegister(),
+               Address(CpuRegister(RSP), source.GetStackIndex()));
     }
   } else if (destination.IsStackSlot()) {
     if (source.IsRegister()) {
-      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsX86_64().AsCpuRegister());
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()),
+              source.AsX86_64().AsCpuRegister());
+    } else if (source.IsFpuRegister()) {
+      __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
+               source.AsX86_64().AsXmmRegister());
     } else {
       DCHECK(source.IsStackSlot());
       __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -338,7 +370,11 @@
   } else {
     DCHECK(destination.IsDoubleStackSlot());
     if (source.IsRegister()) {
-      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsX86_64().AsCpuRegister());
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()),
+              source.AsX86_64().AsCpuRegister());
+    } else if (source.IsFpuRegister()) {
+      __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
+               source.AsX86_64().AsXmmRegister());
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -373,15 +409,17 @@
       case Primitive::kPrimShort:
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
+      case Primitive::kPrimFloat:
         Move(location, Location::StackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
         break;
 
       case Primitive::kPrimLong:
+      case Primitive::kPrimDouble:
         Move(location, Location::DoubleStackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
         break;
 
       default:
-        LOG(FATAL) << "Unimplemented local type " << instruction->GetType();
+        LOG(FATAL) << "Unexpected local type " << instruction->GetType();
     }
   } else {
     DCHECK((instruction->GetNext() == move_for) || instruction->GetNext()->IsTemporary());
@@ -393,11 +431,13 @@
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
       case Primitive::kPrimLong:
+      case Primitive::kPrimFloat:
+      case Primitive::kPrimDouble:
         Move(location, instruction->GetLocations()->Out());
         break;
 
       default:
-        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+        LOG(FATAL) << "Unexpected type " << instruction->GetType();
     }
   }
 }
@@ -443,21 +483,17 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(if_instr, LocationSummary::kNoCall);
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
-  HCondition* condition = cond->AsCondition();
-  if (condition->NeedsMaterialization()) {
+  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
     locations->SetInAt(0, Location::Any(), Location::kDiesAtEntry);
   }
 }
 
 void InstructionCodeGeneratorX86_64::VisitIf(HIf* if_instr) {
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
-  HCondition* condition = cond->AsCondition();
-  if (condition->NeedsMaterialization()) {
+  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
     // Moves do not affect the eflags register, so if the condition is evaluated
     // just before the if, we don't need to evaluate it again.
-    if (!condition->IsBeforeWhenDisregardMoves(if_instr)) {
+    if (!cond->IsCondition() || !cond->AsCondition()->IsBeforeWhenDisregardMoves(if_instr)) {
       // Materialized condition, compare against 0.
       Location lhs = if_instr->GetLocations()->InAt(0);
       if (lhs.IsRegister()) {
@@ -468,8 +504,8 @@
     }
     __ j(kNotEqual, codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
   } else {
-    Location lhs = condition->GetLocations()->InAt(0);
-    Location rhs = condition->GetLocations()->InAt(1);
+    Location lhs = cond->GetLocations()->InAt(0);
+    Location rhs = cond->GetLocations()->InAt(1);
     if (rhs.IsRegister()) {
       __ cmpl(lhs.AsX86_64().AsCpuRegister(), rhs.AsX86_64().AsCpuRegister());
     } else if (rhs.IsConstant()) {
@@ -478,7 +514,7 @@
     } else {
       __ cmpl(lhs.AsX86_64().AsCpuRegister(), Address(CpuRegister(RSP), rhs.GetStackIndex()));
     }
-    __ j(X86_64Condition(condition->GetCondition()),
+    __ j(X86_64Condition(cond->AsCondition()->GetCondition()),
          codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
   }
   if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfFalseSuccessor())) {
@@ -512,15 +548,17 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
+    case Primitive::kPrimFloat:
       locations->SetInAt(1, Location::StackSlot(codegen_->GetStackSlot(store->GetLocal())));
       break;
 
     case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
       locations->SetInAt(1, Location::DoubleStackSlot(codegen_->GetStackSlot(store->GetLocal())));
       break;
 
     default:
-      LOG(FATAL) << "Unimplemented local type " << store->InputAt(1)->GetType();
+      LOG(FATAL) << "Unexpected local type " << store->InputAt(1)->GetType();
   }
 }
 
@@ -679,8 +717,14 @@
       locations->SetInAt(0, X86_64CpuLocation(RAX));
       break;
 
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0,
+          Location::FpuRegisterLocation(X86_64ManagedRegister::FromXmmRegister(XMM0)));
+      break;
+
     default:
-      LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+      LOG(FATAL) << "Unexpected return type " << ret->InputAt(0)->GetType();
   }
 }
 
@@ -697,8 +741,14 @@
         DCHECK_EQ(ret->GetLocations()->InAt(0).AsX86_64().AsCpuRegister().AsRegister(), RAX);
         break;
 
+      case Primitive::kPrimFloat:
+      case Primitive::kPrimDouble:
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsX86_64().AsXmmRegister().AsFloatRegister(),
+                  XMM0);
+        break;
+
       default:
-        LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+        LOG(FATAL) << "Unexpected return type " << ret->InputAt(0)->GetType();
     }
   }
   codegen_->GenerateFrameExit();
@@ -734,10 +784,27 @@
       }
     }
 
-    case Primitive::kPrimDouble:
-    case Primitive::kPrimFloat:
-      LOG(FATAL) << "Unimplemented parameter type " << type;
-      break;
+    case Primitive::kPrimFloat: {
+      uint32_t index = fp_index_++;
+      stack_index_++;
+      if (index < calling_convention.GetNumberOfFpuRegisters()) {
+        return Location::FpuRegisterLocation(X86_64ManagedRegister::FromXmmRegister(
+            calling_convention.GetFpuRegisterAt(index)));
+      } else {
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 1));
+      }
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t index = fp_index_++;
+      stack_index_ += 2;
+      if (index < calling_convention.GetNumberOfFpuRegisters()) {
+        return Location::FpuRegisterLocation(X86_64ManagedRegister::FromXmmRegister(
+            calling_convention.GetFpuRegisterAt(index)));
+      } else {
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
+      }
+    }
 
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unexpected parameter type " << type;
@@ -807,7 +874,8 @@
 
     case Primitive::kPrimDouble:
     case Primitive::kPrimFloat:
-      LOG(FATAL) << "Unimplemented return type " << invoke->GetType();
+      locations->SetOut(
+          Location::FpuRegisterLocation(X86_64ManagedRegister::FromXmmRegister(XMM0)));
       break;
   }
 }
@@ -845,6 +913,7 @@
       locations->SetOut(Location::SameAsFirstInput());
       break;
     }
+
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RequiresRegister());
@@ -852,52 +921,67 @@
       break;
     }
 
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimFloat: {
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::SameAsFirstInput());
       break;
+    }
 
     default:
-      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
   }
 }
 
 void InstructionCodeGeneratorX86_64::VisitAdd(HAdd* add) {
   LocationSummary* locations = add->GetLocations();
-  DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
-            locations->Out().AsX86_64().AsCpuRegister().AsRegister());
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+
+  DCHECK(first.Equals(locations->Out()));
   switch (add->GetResultType()) {
     case Primitive::kPrimInt: {
-      if (locations->InAt(1).IsRegister()) {
-        __ addl(locations->InAt(0).AsX86_64().AsCpuRegister(),
-                locations->InAt(1).AsX86_64().AsCpuRegister());
-      } else if (locations->InAt(1).IsConstant()) {
-        HConstant* instruction = locations->InAt(1).GetConstant();
+      if (second.IsRegister()) {
+        __ addl(first.AsX86_64().AsCpuRegister(), second.AsX86_64().AsCpuRegister());
+      } else if (second.IsConstant()) {
+        HConstant* instruction = second.GetConstant();
         Immediate imm(instruction->AsIntConstant()->GetValue());
-        __ addl(locations->InAt(0).AsX86_64().AsCpuRegister(), imm);
+        __ addl(first.AsX86_64().AsCpuRegister(), imm);
       } else {
-        __ addl(locations->InAt(0).AsX86_64().AsCpuRegister(),
-                Address(CpuRegister(RSP), locations->InAt(1).GetStackIndex()));
+        __ addl(first.AsX86_64().AsCpuRegister(),
+                Address(CpuRegister(RSP), second.GetStackIndex()));
       }
       break;
     }
+
     case Primitive::kPrimLong: {
-      __ addq(locations->InAt(0).AsX86_64().AsCpuRegister(),
-              locations->InAt(1).AsX86_64().AsCpuRegister());
+      __ addq(first.AsX86_64().AsCpuRegister(), second.AsX86_64().AsCpuRegister());
       break;
     }
 
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+    case Primitive::kPrimFloat: {
+      if (second.IsFpuRegister()) {
+        __ addss(first.AsX86_64().AsXmmRegister(), second.AsX86_64().AsXmmRegister());
+      } else {
+        __ addss(first.AsX86_64().AsXmmRegister(),
+                 Address(CpuRegister(RSP), second.GetStackIndex()));
+      }
       break;
+    }
+
+    case Primitive::kPrimDouble: {
+      if (second.IsFpuRegister()) {
+        __ addsd(first.AsX86_64().AsXmmRegister(), second.AsX86_64().AsXmmRegister());
+      } else {
+        __ addsd(first.AsX86_64().AsXmmRegister(),
+                 Address(CpuRegister(RSP), second.GetStackIndex()));
+      }
+      break;
+    }
 
     default:
-      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
   }
 }
 
@@ -1149,8 +1233,9 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::Any());
-  // TODO: Have a normalization phase that makes this instruction never used.
-  locations->SetOut(Location::SameAsFirstInput());
+  if (instruction->HasUses()) {
+    locations->SetOut(Location::SameAsFirstInput());
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitNullCheck(HNullCheck* instruction) {
@@ -1159,13 +1244,16 @@
 
   LocationSummary* locations = instruction->GetLocations();
   Location obj = locations->InAt(0);
-  DCHECK(obj.Equals(locations->Out()));
 
   if (obj.IsRegister()) {
     __ cmpl(obj.AsX86_64().AsCpuRegister(), Immediate(0));
-  } else {
-    DCHECK(locations->InAt(0).IsStackSlot());
+  } else if (obj.IsStackSlot()) {
     __ cmpl(Address(CpuRegister(RSP), obj.GetStackIndex()), Immediate(0));
+  } else {
+    DCHECK(obj.IsConstant()) << obj;
+    DCHECK_EQ(obj.GetConstant()->AsIntConstant()->GetValue(), 0);
+    __ jmp(slow_path->GetEntryLabel());
+    return;
   }
   __ j(kEqual, slow_path->GetEntryLabel());
 }
@@ -1283,6 +1371,11 @@
     locations->SetInAt(
         1, Location::RegisterOrConstant(instruction->InputAt(1)), Location::kDiesAtEntry);
-    locations->SetInAt(2, Location::RequiresRegister(), Location::kDiesAtEntry);
+    if (value_type == Primitive::kPrimLong) {
+      locations->SetInAt(2, Location::RequiresRegister(), Location::kDiesAtEntry);
+    } else {
+      locations->SetInAt(
+          2, Location::RegisterOrConstant(instruction->InputAt(2)), Location::kDiesAtEntry);
+    }
   }
 }
 
@@ -1290,18 +1383,28 @@
   LocationSummary* locations = instruction->GetLocations();
   CpuRegister obj = locations->InAt(0).AsX86_64().AsCpuRegister();
   Location index = locations->InAt(1);
+  Location value = locations->InAt(2);
   Primitive::Type value_type = instruction->GetComponentType();
 
   switch (value_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value();
-      CpuRegister value = locations->InAt(2).AsX86_64().AsCpuRegister();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ movb(Address(obj, offset), value);
+        if (value.IsRegister()) {
+          __ movb(Address(obj, offset), value.AsX86_64().AsCpuRegister());
+        } else {
+          __ movb(Address(obj, offset), Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       } else {
-        __ movb(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_1, data_offset), value);
+        if (value.IsRegister()) {
+          __ movb(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_1, data_offset),
+                  value.AsX86_64().AsCpuRegister());
+        } else {
+          __ movb(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_1, data_offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       }
       break;
     }
@@ -1309,24 +1412,42 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value();
-      CpuRegister value = locations->InAt(2).AsX86_64().AsCpuRegister();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ movw(Address(obj, offset), value);
+        if (value.IsRegister()) {
+          __ movw(Address(obj, offset), value.AsX86_64().AsCpuRegister());
+        } else {
+          __ movw(Address(obj, offset), Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       } else {
-        __ movw(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_2, data_offset), value);
+        if (value.IsRegister()) {
+          __ movw(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_2, data_offset),
+                  value.AsX86_64().AsCpuRegister());
+        } else {
+          __ movw(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_2, data_offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       }
       break;
     }
 
     case Primitive::kPrimInt: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-      CpuRegister value = locations->InAt(2).AsX86_64().AsCpuRegister();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ movl(Address(obj, offset), value);
+        if (value.IsRegister()) {
+          __ movl(Address(obj, offset), value.AsX86_64().AsCpuRegister());
+        } else {
+          __ movl(Address(obj, offset), Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       } else {
-        __ movl(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_4, data_offset), value);
+        if (value.IsRegister()) {
+          __ movl(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_4, data_offset),
+                  value.AsX86_64().AsCpuRegister());
+        } else {
+          __ movl(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_4, data_offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+        }
       }
       break;
     }
@@ -1340,12 +1461,14 @@
 
     case Primitive::kPrimLong: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value();
-      CpuRegister value = locations->InAt(2).AsX86_64().AsCpuRegister();
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ movq(Address(obj, offset), value);
+        DCHECK(value.IsRegister());
+        __ movq(Address(obj, offset), value.AsX86_64().AsCpuRegister());
       } else {
-        __ movq(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_8, data_offset), value);
+        DCHECK(value.IsRegister());
+        __ movq(Address(obj, index.AsX86_64().AsCpuRegister(), TIMES_8, data_offset),
+                value.AsX86_64().AsCpuRegister());
       }
       break;
     }
@@ -1379,8 +1502,9 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // TODO: Have a normalization phase that makes this instruction never used.
-  locations->SetOut(Location::SameAsFirstInput());
+  if (instruction->HasUses()) {
+    locations->SetOut(Location::SameAsFirstInput());
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitBoundsCheck(HBoundsCheck* instruction) {
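
The GetNextLocation changes above walk FP arguments with a separate fp_index_ while every argument still advances stack_index_ (one word for a float, two for a double). A simplified model of that walk, not ART's code:

    #include <cstdint>
    #include <cstdio>

    // Floats consume one FPU index and one stack word; doubles consume one
    // FPU index and two stack words, matching the visitor above.
    struct ArgWalk {
      uint32_t fp_index = 0;
      uint32_t stack_index = 0;
      static constexpr uint32_t kNumFpuRegs = 8;  // XMM0-XMM7 above

      int NextFloat() {   // returns the XMM index, or -1 for a stack slot
        uint32_t index = fp_index++;
        stack_index += 1;
        return index < kNumFpuRegs ? static_cast<int>(index) : -1;
      }
      int NextDouble() {
        uint32_t index = fp_index++;
        stack_index += 2;
        return index < kNumFpuRegs ? static_cast<int>(index) : -1;
      }
    };

    int main() {
      ArgWalk walk;
      for (int i = 0; i < 9; ++i) {
        std::printf("double arg %d -> %d\n", i, walk.NextDouble());  // 0..7, then -1
      }
    }
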
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index a299cf6..c81f785 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -28,13 +28,19 @@
 static constexpr size_t kX86_64WordSize = 8;
 
 static constexpr Register kParameterCoreRegisters[] = { RSI, RDX, RCX, R8, R9 };
+static constexpr FloatRegister kParameterFloatRegisters[] =
+    { XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7 };
 
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+static constexpr size_t kParameterFloatRegistersLength = arraysize(kParameterFloatRegisters);
 
-class InvokeDexCallingConvention : public CallingConvention<Register> {
+class InvokeDexCallingConvention : public CallingConvention<Register, FloatRegister> {
  public:
-  InvokeDexCallingConvention()
-      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
+  InvokeDexCallingConvention() : CallingConvention(
+      kParameterCoreRegisters,
+      kParameterCoreRegistersLength,
+      kParameterFloatRegisters,
+      kParameterFloatRegistersLength) {}
 
  private:
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
@@ -42,13 +48,17 @@
 
 class InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor() : gp_index_(0), stack_index_(0) {}
+  InvokeDexCallingConventionVisitor() : gp_index_(0), fp_index_(0), stack_index_(0) {}
 
   Location GetNextLocation(Primitive::Type type);
 
  private:
   InvokeDexCallingConvention calling_convention;
+  // The current index for cpu registers.
   uint32_t gp_index_;
+  // The current index for fpu registers.
+  uint32_t fp_index_;
+  // The current stack index.
   uint32_t stack_index_;
 
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
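
The constant_propagation_test.cc diff below threads a std::function checker through TestCode so each test can inspect the graph between the two passes. A minimal sketch of that pattern with illustrative names, mirroring the 3/7/10 constants checked below:

    #include <cassert>
    #include <functional>
    #include <vector>

    // Pipeline stand-in: stage one folds values, the caller-supplied lambda
    // verifies the intermediate state, stage two removes dead entries.
    static void RunPipeline(std::vector<int>* values,
                            const std::function<void(const std::vector<int>&)>& check_mid) {
      for (int& v : *values) v += 1;  // "constant propagation" stand-in
      check_mid(*values);             // hook comparable to check_after_cp
      values->clear();                // "dead code elimination" stand-in
    }

    int main() {
      std::vector<int> values = {2, 6, 9};
      RunPipeline(&values, [](const std::vector<int>& v) {
        assert(v[0] == 3 && v[1] == 7 && v[2] == 10);
      });
      assert(values.empty());
    }
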
diff --git a/compiler/optimizing/constant_propagation_test.cc b/compiler/optimizing/constant_propagation_test.cc
index 5c8c709..342777a 100644
--- a/compiler/optimizing/constant_propagation_test.cc
+++ b/compiler/optimizing/constant_propagation_test.cc
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <functional>
+
 #include "constant_propagation.h"
 #include "dead_code_elimination.h"
 #include "pretty_printer.h"
@@ -27,10 +29,12 @@
 static void TestCode(const uint16_t* data,
                      const std::string& expected_before,
                      const std::string& expected_after_cp,
-                     const std::string& expected_after_dce) {
+                     const std::string& expected_after_dce,
+                     std::function<void(HGraph*)> check_after_cp,
+                     Primitive::Type return_type = Primitive::kPrimInt) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  HGraph* graph = CreateCFG(&allocator, data);
+  HGraph* graph = CreateCFG(&allocator, data, return_type);
   ASSERT_NE(graph, nullptr);
 
   graph->BuildDominatorTree();
@@ -48,6 +52,8 @@
   std::string actual_after_cp = printer_after_cp.str();
   ASSERT_EQ(expected_after_cp, actual_after_cp);
 
+  check_after_cp(graph);
+
   DeadCodeElimination(graph).Run();
 
   StringPrettyPrinter printer_after_dce(graph);
@@ -100,6 +106,13 @@
   };
   std::string expected_after_cp = Patch(expected_before, expected_cp_diff);
 
+  // Check the value of the computed constant.
+  auto check_after_cp = [](HGraph* graph) {
+    HInstruction* inst = graph->GetBlock(1)->GetFirstInstruction();
+    ASSERT_TRUE(inst->IsIntConstant());
+    ASSERT_EQ(inst->AsIntConstant()->GetValue(), 3);
+  };
+
   // Expected difference after dead code elimination.
   diff_t expected_dce_diff = {
     { "  3: IntConstant\n", removed },
@@ -107,7 +120,11 @@
   };
   std::string expected_after_dce = Patch(expected_after_cp, expected_dce_diff);
 
-  TestCode(data, expected_before, expected_after_cp, expected_after_dce);
+  TestCode(data,
+           expected_before,
+           expected_after_cp,
+           expected_after_dce,
+           check_after_cp);
 }
 
 /**
@@ -165,6 +182,19 @@
   };
   std::string expected_after_cp = Patch(expected_before, expected_cp_diff);
 
+  // Check the values of the computed constants.
+  auto check_after_cp = [](HGraph* graph) {
+    HInstruction* inst1 = graph->GetBlock(1)->GetFirstInstruction();
+    ASSERT_TRUE(inst1->IsIntConstant());
+    ASSERT_EQ(inst1->AsIntConstant()->GetValue(), 3);
+    HInstruction* inst2 = inst1->GetNext();
+    ASSERT_TRUE(inst2->IsIntConstant());
+    ASSERT_EQ(inst2->AsIntConstant()->GetValue(), 7);
+    HInstruction* inst3 = inst2->GetNext();
+    ASSERT_TRUE(inst3->IsIntConstant());
+    ASSERT_EQ(inst3->AsIntConstant()->GetValue(), 10);
+  };
+
   // Expected difference after dead code elimination.
   diff_t expected_dce_diff = {
     { "  3: IntConstant\n",  removed },
@@ -176,7 +206,11 @@
   };
   std::string expected_after_dce = Patch(expected_after_cp, expected_dce_diff);
 
-  TestCode(data, expected_before, expected_after_cp, expected_after_dce);
+  TestCode(data,
+           expected_before,
+           expected_after_cp,
+           expected_after_dce,
+           check_after_cp);
 }
 
 /**
@@ -218,6 +252,13 @@
   };
   std::string expected_after_cp = Patch(expected_before, expected_cp_diff);
 
+  // Check the value of the computed constant.
+  auto check_after_cp = [](HGraph* graph) {
+    HInstruction* inst = graph->GetBlock(1)->GetFirstInstruction();
+    ASSERT_TRUE(inst->IsIntConstant());
+    ASSERT_EQ(inst->AsIntConstant()->GetValue(), 1);
+  };
+
   // Expected difference after dead code elimination.
   diff_t expected_dce_diff = {
     { "  3: IntConstant\n", removed },
@@ -225,7 +266,11 @@
   };
   std::string expected_after_dce = Patch(expected_after_cp, expected_dce_diff);
 
-  TestCode(data, expected_before, expected_after_cp, expected_after_dce);
+  TestCode(data,
+           expected_before,
+           expected_after_cp,
+           expected_after_dce,
+           check_after_cp);
 }
 
 #define SIX_REGISTERS_CODE_ITEM(...)                                     \
@@ -272,6 +317,13 @@
   };
   std::string expected_after_cp = Patch(expected_before, expected_cp_diff);
 
+  // Check the value of the computed constant.
+  auto check_after_cp = [](HGraph* graph) {
+    HInstruction* inst = graph->GetBlock(1)->GetFirstInstruction();
+    ASSERT_TRUE(inst->IsLongConstant());
+    ASSERT_EQ(inst->AsLongConstant()->GetValue(), 3);
+  };
+
   // Expected difference after dead code elimination.
   diff_t expected_dce_diff = {
     { "  6: LongConstant\n", removed },
@@ -279,7 +331,12 @@
   };
   std::string expected_after_dce = Patch(expected_after_cp, expected_dce_diff);
 
-  TestCode(data, expected_before, expected_after_cp, expected_after_dce);
+  TestCode(data,
+           expected_before,
+           expected_after_cp,
+           expected_after_dce,
+           check_after_cp,
+           Primitive::kPrimLong);
 }
 
 /**
@@ -323,6 +380,13 @@
   };
   std::string expected_after_cp = Patch(expected_before, expected_cp_diff);
 
+  // Check the value of the computed constant.
+  auto check_after_cp = [](HGraph* graph) {
+    HInstruction* inst = graph->GetBlock(1)->GetFirstInstruction();
+    ASSERT_TRUE(inst->IsLongConstant());
+    ASSERT_EQ(inst->AsLongConstant()->GetValue(), 1);
+  };
+
   // Expected difference after dead code elimination.
   diff_t expected_dce_diff = {
     { "  6: LongConstant\n", removed },
@@ -330,7 +394,12 @@
   };
   std::string expected_after_dce = Patch(expected_after_cp, expected_dce_diff);
 
-  TestCode(data, expected_before, expected_after_cp, expected_after_dce);
+  TestCode(data,
+           expected_before,
+           expected_after_cp,
+           expected_after_dce,
+           check_after_cp,
+           Primitive::kPrimLong);
 }
 
 /**
@@ -370,26 +439,26 @@
 
   std::string expected_before =
     "BasicBlock 0, succ: 1\n"
-    "  3: IntConstant [9]\n"
-    "  5: IntConstant [9]\n"
-    "  13: IntConstant [14]\n"
-    "  18: IntConstant [19]\n"
-    "  24: IntConstant [25]\n"
+    "  3: IntConstant [9]\n"            // v0 <- 0
+    "  5: IntConstant [9]\n"            // v1 <- 1
+    "  13: IntConstant [14]\n"          // const 3
+    "  18: IntConstant [19]\n"          // const 2
+    "  24: IntConstant [25]\n"          // const 4
     "  30: SuspendCheck\n"
     "  31: Goto 1\n"
     "BasicBlock 1, pred: 0, succ: 3\n"
-    "  9: Add(3, 5) [19]\n"
-    "  11: Goto 3\n"
-    "BasicBlock 2, pred: 3, succ: 4\n"
-    "  14: Add(19, 13) [25]\n"
-    "  16: Goto 4\n"
-    "BasicBlock 3, pred: 1, succ: 2\n"
-    "  19: Add(9, 18) [14]\n"
+    "  9: Add(3, 5) [19]\n"             // v2 <- v0 + v1 = 0 + 1 = 1
+    "  11: Goto 3\n"                    // goto L2
+    "BasicBlock 2, pred: 3, succ: 4\n"  // L1:
+    "  14: Add(19, 13) [25]\n"          // v1 <- v0 + 3 = 3 + 3 = 6
+    "  16: Goto 4\n"                    // goto L3
+    "BasicBlock 3, pred: 1, succ: 2\n"  // L2:
+    "  19: Add(9, 18) [14]\n"           // v0 <- v2 + 2 = 1 + 2 = 3
     "  21: SuspendCheck\n"
-    "  22: Goto 2\n"
-    "BasicBlock 4, pred: 2, succ: 5\n"
-    "  25: Add(14, 24) [28]\n"
-    "  28: Return(25)\n"
+    "  22: Goto 2\n"                    // goto L1
+    "BasicBlock 4, pred: 2, succ: 5\n"  // L3:
+    "  25: Add(14, 24) [28]\n"          // v2 <- v1 + 4 = 6 + 4 = 10
+    "  28: Return(25)\n"                // return v2
     "BasicBlock 5, pred: 4\n"
     "  29: Exit\n";
 
@@ -408,6 +477,22 @@
   };
   std::string expected_after_cp = Patch(expected_before, expected_cp_diff);
 
+  // Check the values of the computed constants.
+  auto check_after_cp = [](HGraph* graph) {
+    HInstruction* inst1 = graph->GetBlock(1)->GetFirstInstruction();
+    ASSERT_TRUE(inst1->IsIntConstant());
+    ASSERT_EQ(inst1->AsIntConstant()->GetValue(), 1);
+    HInstruction* inst2 = graph->GetBlock(2)->GetFirstInstruction();
+    ASSERT_TRUE(inst2->IsIntConstant());
+    ASSERT_EQ(inst2->AsIntConstant()->GetValue(), 6);
+    HInstruction* inst3 = graph->GetBlock(3)->GetFirstInstruction();
+    ASSERT_TRUE(inst3->IsIntConstant());
+    ASSERT_EQ(inst3->AsIntConstant()->GetValue(), 3);
+    HInstruction* inst4 = graph->GetBlock(4)->GetFirstInstruction();
+    ASSERT_TRUE(inst4->IsIntConstant());
+    ASSERT_EQ(inst4->AsIntConstant()->GetValue(), 10);
+  };
+
   // Expected difference after dead code elimination.
   diff_t expected_dce_diff = {
     { "  3: IntConstant\n",     removed },
@@ -418,7 +503,11 @@
   };
   std::string expected_after_dce = Patch(expected_after_cp, expected_dce_diff);
 
-  TestCode(data, expected_before, expected_after_cp, expected_after_dce);
+  TestCode(data,
+           expected_before,
+           expected_after_cp,
+           expected_after_dce,
+           check_after_cp);
 }
 
 
@@ -473,6 +562,13 @@
   };
   std::string expected_after_cp = Patch(expected_before, expected_cp_diff);
 
+  // Check the values of the computed constants.
+  auto check_after_cp = [](HGraph* graph) {
+    HInstruction* inst = graph->GetBlock(1)->GetFirstInstruction();
+    ASSERT_TRUE(inst->IsIntConstant());
+    ASSERT_EQ(inst->AsIntConstant()->GetValue(), 1);
+  };
+
   // Expected difference after dead code elimination.
   diff_t expected_dce_diff = {
     { "  3: IntConstant [15, 22]\n", "  3: IntConstant [22]\n" },
@@ -481,7 +577,11 @@
   };
   std::string expected_after_dce = Patch(expected_after_cp, expected_dce_diff);
 
-  TestCode(data, expected_before, expected_after_cp, expected_after_dce);
+  TestCode(data,
+           expected_before,
+           expected_after_cp,
+           expected_after_dce,
+           check_after_cp);
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 0fb4737..686a0b0 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -158,6 +158,7 @@
       }
     }
     output_ << ")";
+    output_ << " (liveness: " << instruction->GetLifetimePosition() << ")";
   }
 
   void VisitInstruction(HInstruction* instruction) {
@@ -191,6 +192,7 @@
           DumpLocation(locations->Out(), instruction->GetType());
         }
       }
+      output_ << " (liveness: " << instruction->GetLifetimePosition() << ")";
     }
   }
 
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index a0de73d..2d9e35c 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -38,4 +38,21 @@
   block->RemoveInstruction(check);
 }
 
+void InstructionSimplifier::VisitEqual(HEqual* equal) {
+  HInstruction* input1 = equal->InputAt(0);
+  HInstruction* input2 = equal->InputAt(1);
+  if (input1->GetType() == Primitive::kPrimBoolean && input2->IsIntConstant()) {
+    if (input2->AsIntConstant()->GetValue() == 1) {
+      // Replace (bool_value == 1) with bool_value.
+      equal->ReplaceWith(equal->InputAt(0));
+      equal->GetBlock()->RemoveInstruction(equal);
+    } else {
+      // Replace (bool_value == 0) with !bool_value.
+      DCHECK_EQ(input2->AsIntConstant()->GetValue(), 0);
+      equal->GetBlock()->ReplaceAndRemoveInstructionWith(
+          equal, new (GetGraph()->GetArena()) HNot(input1));
+    }
+  }
+}
+
 }  // namespace art
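
A standalone illustration of the rewrite (my own code, not part of the patch):
the scalar equivalent of what VisitEqual does to a boolean value compared
against an int constant.

    #include <cassert>

    // (b == 1) becomes b; (b == 0) becomes !b. The DCHECK in the pass
    // encodes the assumption that a boolean is only compared against 0 or 1.
    static bool SimplifiedEqual(bool b, int constant) {
      assert(constant == 0 || constant == 1);
      return constant == 1 ? b : !b;
    }
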
diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h
index b2f3f52..d74b624 100644
--- a/compiler/optimizing/instruction_simplifier.h
+++ b/compiler/optimizing/instruction_simplifier.h
@@ -32,6 +32,7 @@
 
  private:
   virtual void VisitSuspendCheck(HSuspendCheck* check) OVERRIDE;
+  virtual void VisitEqual(HEqual* equal) OVERRIDE;
 };
 
 }  // namespace art
diff --git a/compiler/optimizing/locations.cc b/compiler/optimizing/locations.cc
index 114d69c..7b09241 100644
--- a/compiler/optimizing/locations.cc
+++ b/compiler/optimizing/locations.cc
@@ -55,4 +55,15 @@
       : Location::RequiresRegister();
 }
 
+Location Location::ByteRegisterOrConstant(ManagedRegister reg, HInstruction* instruction) {
+  return instruction->IsConstant()
+      ? Location::ConstantLocation(instruction->AsConstant())
+      : Location::RegisterLocation(reg);
+}
+
+std::ostream& operator<<(std::ostream& os, const Location& location) {
+  os << location.DebugString();
+  return os;
+}
+
 }  // namespace art
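
A hedged usage sketch for the new helper (the call site below is hypothetical,
though X86ManagedRegister::FromCpuRegister is a real utility): a code generator
can request it when an operand must end up either as an immediate or in a
byte-addressable register, e.g. ahead of a setcc-style instruction.

    locations->SetInAt(1, Location::ByteRegisterOrConstant(
        X86ManagedRegister::FromCpuRegister(ECX), instruction->InputAt(1)));
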
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 8d0715a..5f85b6a 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -39,31 +39,39 @@
   enum Kind {
     kInvalid = 0,
     kConstant = 1,
-    kStackSlot = 2,  // Word size slot.
+    kStackSlot = 2,  // 32bit stack slot.
     kDoubleStackSlot = 3,  // 64bit stack slot.
-    kRegister = 4,
+
+    kRegister = 4,  // Core register.
+
+    // We do not use the value 5 because it conflicts with kLocationConstantMask.
+    kDoNotUse = 5,
+
+    kFpuRegister = 6,  // Floating point register.
+
     // On 32bits architectures, quick can pass a long where the
     // low bits are in the last parameter register, and the high
     // bits are in a stack slot. The kQuickParameter kind is for
     // handling this special case.
-    kQuickParameter = 6,
+    kQuickParameter = 7,
 
     // Unallocated location represents a location that is not fixed and can be
     // allocated by a register allocator.  Each unallocated location has
     // a policy that specifies what kind of location is suitable. Payload
     // contains register allocation policy.
-    kUnallocated = 7,
+    kUnallocated = 8,
   };
 
   Location() : value_(kInvalid) {
-    // Verify that non-tagged location kinds do not interfere with kConstantTag.
-    COMPILE_ASSERT((kInvalid & kLocationTagMask) != kConstant, TagError);
-    COMPILE_ASSERT((kUnallocated & kLocationTagMask) != kConstant, TagError);
-    COMPILE_ASSERT((kStackSlot & kLocationTagMask) != kConstant, TagError);
-    COMPILE_ASSERT((kDoubleStackSlot & kLocationTagMask) != kConstant, TagError);
-    COMPILE_ASSERT((kRegister & kLocationTagMask) != kConstant, TagError);
-    COMPILE_ASSERT((kQuickParameter & kLocationTagMask) != kConstant, TagError);
-    COMPILE_ASSERT((kConstant & kLocationTagMask) == kConstant, TagError);
+    // Verify that non-constant location kinds do not interfere with kConstant.
+    COMPILE_ASSERT((kInvalid & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kUnallocated & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kStackSlot & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kDoubleStackSlot & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kRegister & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kQuickParameter & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kFpuRegister & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kConstant & kLocationConstantMask) == kConstant, TagError);
 
     DCHECK(!IsValid());
   }
@@ -76,7 +84,7 @@
   }
 
   bool IsConstant() const {
-    return (value_ & kLocationTagMask) == kConstant;
+    return (value_ & kLocationConstantMask) == kConstant;
   }
 
   static Location ConstantLocation(HConstant* constant) {
@@ -86,7 +94,7 @@
 
   HConstant* GetConstant() const {
     DCHECK(IsConstant());
-    return reinterpret_cast<HConstant*>(value_ & ~kLocationTagMask);
+    return reinterpret_cast<HConstant*>(value_ & ~kLocationConstantMask);
   }
 
   bool IsValid() const {
@@ -107,12 +115,20 @@
     return Location(kRegister, reg.RegId());
   }
 
+  static Location FpuRegisterLocation(ManagedRegister reg) {
+    return Location(kFpuRegister, reg.RegId());
+  }
+
   bool IsRegister() const {
     return GetKind() == kRegister;
   }
 
+  bool IsFpuRegister() const {
+    return GetKind() == kFpuRegister;
+  }
+
   ManagedRegister reg() const {
-    DCHECK(IsRegister());
+    DCHECK(IsRegister() || IsFpuRegister());
     return static_cast<ManagedRegister>(GetPayload());
   }
 
@@ -192,7 +208,11 @@
       case kQuickParameter: return "Q";
       case kUnallocated: return "U";
       case kConstant: return "C";
+      case kFpuRegister: return "F";
+      case kDoNotUse:
+        LOG(FATAL) << "Should not use this location kind";
     }
+    UNREACHABLE();
     return "?";
   }
 
@@ -200,6 +220,7 @@
   enum Policy {
     kAny,
     kRequiresRegister,
+    kRequiresFpuRegister,
     kSameAsFirstInput,
   };
 
@@ -220,7 +241,12 @@
     return UnallocatedLocation(kRequiresRegister);
   }
 
+  static Location RequiresFpuRegister() {
+    return UnallocatedLocation(kRequiresFpuRegister);
+  }
+
   static Location RegisterOrConstant(HInstruction* instruction);
+  static Location ByteRegisterOrConstant(ManagedRegister reg, HInstruction* instruction);
 
   // The location of the first input to the instruction will be
   // used to replace this unallocated location.
@@ -241,7 +267,7 @@
   // Number of bits required to encode Kind value.
   static constexpr uint32_t kBitsForKind = 4;
   static constexpr uint32_t kBitsForPayload = kWordSize * kBitsPerByte - kBitsForKind;
-  static constexpr uword kLocationTagMask = 0x3;
+  static constexpr uword kLocationConstantMask = 0x3;
 
   explicit Location(uword value) : value_(value) {}
 
@@ -422,6 +448,8 @@
   DISALLOW_COPY_AND_ASSIGN(LocationSummary);
 };
 
+std::ostream& operator<<(std::ostream& os, const Location& location);
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_LOCATIONS_H_
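
Why value 5 is reserved as kDoNotUse: a Location stores either a small Kind or
an HConstant pointer tagged with kConstant in its low two bits, so any Kind
whose low two bits equal kConstant would be indistinguishable from a tagged
pointer. A quick standalone check (mine, written with static_assert rather
than the file's COMPILE_ASSERT):

    static_assert((5 & 0x3) == 1, "5 would alias the kConstant tag");
    static_assert((6 & 0x3) != 1, "kFpuRegister = 6 is safe");
    static_assert((7 & 0x3) != 1, "kQuickParameter = 7 is safe");
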
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 5c4ab8e..4cac319 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -308,6 +308,14 @@
   return false;
 }
 
+static void UpdateInputsUsers(HInstruction* instruction) {
+  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+    instruction->InputAt(i)->AddUseAt(instruction, i);
+  }
+  // Environment should be created later.
+  DCHECK(!instruction->HasEnvironment());
+}
+
 void HBasicBlock::InsertInstructionBefore(HInstruction* instruction, HInstruction* cursor) {
   DCHECK(cursor->AsPhi() == nullptr);
   DCHECK(instruction->AsPhi() == nullptr);
@@ -325,6 +333,7 @@
   }
   instruction->SetBlock(this);
   instruction->SetId(GetGraph()->GetNextInstructionId());
+  UpdateInputsUsers(instruction);
 }
 
 void HBasicBlock::ReplaceAndRemoveInstructionWith(HInstruction* initial,
@@ -342,6 +351,7 @@
   DCHECK_EQ(instruction->GetId(), -1);
   instruction->SetBlock(block);
   instruction->SetId(block->GetGraph()->GetNextInstructionId());
+  UpdateInputsUsers(instruction);
   instruction_list->AddInstruction(instruction);
 }
 
@@ -421,9 +431,6 @@
     instruction->previous_ = last_instruction_;
     last_instruction_ = instruction;
   }
-  for (size_t i = 0; i < instruction->InputCount(); i++) {
-    instruction->InputAt(i)->AddUseAt(instruction, i);
-  }
 }
 
 void HInstructionList::RemoveInstruction(HInstruction* instruction) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 3d65366..fc5b06d 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -93,6 +93,7 @@
 
   ArenaAllocator* GetArena() const { return arena_; }
   const GrowableArray<HBasicBlock*>& GetBlocks() const { return blocks_; }
+  HBasicBlock* GetBlock(size_t id) const { return blocks_.Get(id); }
 
   HBasicBlock* GetEntryBlock() const { return entry_block_; }
   HBasicBlock* GetExitBlock() const { return exit_block_; }
@@ -1143,8 +1144,12 @@
   HEqual(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x == y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x == y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x == y ? 1 : 0;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x == y ? 1 : 0;
+  }
 
   DECLARE_INSTRUCTION(Equal);
 
@@ -1161,8 +1166,12 @@
   HNotEqual(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x != y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x != y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x != y ? 1 : 0;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x != y ? 1 : 0;
+  }
 
   DECLARE_INSTRUCTION(NotEqual);
 
@@ -1179,8 +1188,12 @@
   HLessThan(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x < y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x < y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x < y ? 1 : 0;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x < y ? 1 : 0;
+  }
 
   DECLARE_INSTRUCTION(LessThan);
 
@@ -1197,8 +1210,12 @@
   HLessThanOrEqual(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x <= y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x <= y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x <= y ? 1 : 0;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x <= y ? 1 : 0;
+  }
 
   DECLARE_INSTRUCTION(LessThanOrEqual);
 
@@ -1215,8 +1232,12 @@
   HGreaterThan(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x > y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x > y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x > y ? 1 : 0;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x > y ? 1 : 0;
+  }
 
   DECLARE_INSTRUCTION(GreaterThan);
 
@@ -1233,8 +1254,12 @@
   HGreaterThanOrEqual(HInstruction* first, HInstruction* second)
       : HCondition(first, second) {}
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x >= y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x >= y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x >= y ? 1 : 0;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x >= y ? 1 : 0;
+  }
 
   DECLARE_INSTRUCTION(GreaterThanOrEqual);
 
@@ -1257,13 +1282,13 @@
     DCHECK_EQ(type, second->GetType());
   }
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const {
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
     return
       x == y ? 0 :
       x > y ? 1 :
       -1;
   }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const {
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
     return
       x == y ? 0 :
       x > y ? 1 :
@@ -1490,8 +1515,12 @@
 
   virtual bool IsCommutative() { return true; }
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x + y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x + y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x + y;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x + y;
+  }
 
   DECLARE_INSTRUCTION(Add);
 
@@ -1506,8 +1535,12 @@
 
   virtual bool IsCommutative() { return false; }
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x + y; }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x + y; }
+  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return x - y;
+  }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return x - y;
+  }
 
   DECLARE_INSTRUCTION(Sub);
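
Two things change in these Evaluate overrides: comparison results are now
normalized to exactly 0 or 1 (matching how Dex materializes booleans as ints),
and HSub::Evaluate no longer returns x + y, a genuine bug that made constant
folding of subtractions produce sums. A trivial standalone check (my own):

    #include <cassert>

    int main() {
      assert((7 > 3 ? 1 : 0) == 1);  // Canonical boolean result.
      assert(5 - 3 == 2);            // The old HSub::Evaluate folded this to 8.
      return 0;
    }
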
 
diff --git a/compiler/optimizing/nodes_test.cc b/compiler/optimizing/nodes_test.cc
index b75bacb..70dd8d7 100644
--- a/compiler/optimizing/nodes_test.cc
+++ b/compiler/optimizing/nodes_test.cc
@@ -63,4 +63,55 @@
   ASSERT_FALSE(parameter->HasUses());
 }
 
+/**
+ * Test that inserting an instruction in the graph updates user lists.
+ */
+TEST(Node, InsertInstruction) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
+  graph->AddBlock(entry);
+  graph->SetEntryBlock(entry);
+  HInstruction* parameter1 = new (&allocator) HParameterValue(0, Primitive::kPrimNot);
+  HInstruction* parameter2 = new (&allocator) HParameterValue(0, Primitive::kPrimNot);
+  entry->AddInstruction(parameter1);
+  entry->AddInstruction(parameter2);
+  entry->AddInstruction(new (&allocator) HExit());
+
+  ASSERT_FALSE(parameter1->HasUses());
+  ASSERT_EQ(parameter1->NumberOfUses(), 0u);
+
+  HInstruction* to_insert = new (&allocator) HNullCheck(parameter1, 0);
+  entry->InsertInstructionBefore(to_insert, parameter2);
+
+  ASSERT_TRUE(parameter1->HasUses());
+  ASSERT_EQ(parameter1->NumberOfUses(), 1u);
+}
+
+/**
+ * Test that adding an instruction in the graph updates user lists.
+ */
+TEST(Node, AddInstruction) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
+  graph->AddBlock(entry);
+  graph->SetEntryBlock(entry);
+  HInstruction* parameter = new (&allocator) HParameterValue(0, Primitive::kPrimNot);
+  entry->AddInstruction(parameter);
+
+  ASSERT_FALSE(parameter->HasUses());
+  ASSERT_EQ(parameter->NumberOfUses(), 0u);
+
+  HInstruction* to_add = new (&allocator) HNullCheck(parameter, 0);
+  entry->AddInstruction(to_add);
+
+  ASSERT_TRUE(parameter->HasUses());
+  ASSERT_EQ(parameter->NumberOfUses(), 1u);
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 65bdb18..3cf5a0b 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -28,6 +28,7 @@
 #include "gvn.h"
 #include "instruction_simplifier.h"
 #include "nodes.h"
+#include "prepare_for_register_allocation.h"
 #include "register_allocator.h"
 #include "ssa_phi_elimination.h"
 #include "ssa_liveness_analysis.h"
@@ -265,6 +266,7 @@
     InstructionSimplifier(graph).Run();
     GlobalValueNumberer(graph->GetArena(), graph).Run();
     visualizer.DumpGraph(kGVNPassName);
+    PrepareForRegisterAllocation(graph).Run();
 
     SsaLivenessAnalysis liveness(*graph, codegen);
     liveness.Analyze();
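
The resulting pipeline in this method (as visible in the hunk) is:

    InstructionSimplifier(graph).Run();
    GlobalValueNumberer(graph->GetArena(), graph).Run();
    PrepareForRegisterAllocation(graph).Run();  // New in this change.
    SsaLivenessAnalysis liveness(*graph, codegen);
    liveness.Analyze();

Running the prepare pass before liveness is the point: uses of null and bounds
checks are rewired first, so the checks never get live ranges to begin with.
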
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index 6dd53e5..5b693dd 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -68,8 +68,10 @@
 }
 
 // Create a control-flow graph from Dex instructions.
-inline HGraph* CreateCFG(ArenaAllocator* allocator, const uint16_t* data) {
-  HGraphBuilder builder(allocator);
+inline HGraph* CreateCFG(ArenaAllocator* allocator,
+                         const uint16_t* data,
+                         Primitive::Type return_type = Primitive::kPrimInt) {
+  HGraphBuilder builder(allocator, return_type);
   const DexFile::CodeItem* item =
     reinterpret_cast<const DexFile::CodeItem*>(data);
   HGraph* graph = builder.BuildGraph(*item);
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
new file mode 100644
index 0000000..bfbbab5
--- /dev/null
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "prepare_for_register_allocation.h"
+
+namespace art {
+
+void PrepareForRegisterAllocation::Run() {
+  // Order does not matter.
+  for (HReversePostOrderIterator it(*GetGraph()); !it.Done(); it.Advance()) {
+    HBasicBlock* block = it.Current();
+    // No need to visit the phis.
+    for (HInstructionIterator inst_it(block->GetInstructions()); !inst_it.Done(); inst_it.Advance()) {
+      inst_it.Current()->Accept(this);
+    }
+  }
+}
+
+void PrepareForRegisterAllocation::VisitNullCheck(HNullCheck* check) {
+  check->ReplaceWith(check->InputAt(0));
+}
+
+void PrepareForRegisterAllocation::VisitBoundsCheck(HBoundsCheck* check) {
+  check->ReplaceWith(check->InputAt(0));
+}
+
+}  // namespace art
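
A hedged gtest-style sketch (modelled on nodes_test.cc, not part of the patch)
of the pass's observable effect. It drives the visitor directly instead of
calling Run(), to avoid needing a reverse-post-order computation on the toy
graph:

    TEST(PrepareForRegisterAllocationTest, NullCheckUsesAreRewired) {
      ArenaPool pool;
      ArenaAllocator allocator(&pool);
      HGraph* graph = new (&allocator) HGraph(&allocator);
      HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
      graph->AddBlock(entry);
      graph->SetEntryBlock(entry);
      HInstruction* parameter = new (&allocator) HParameterValue(0, Primitive::kPrimNot);
      entry->AddInstruction(parameter);
      HInstruction* check = new (&allocator) HNullCheck(parameter, 0);
      entry->AddInstruction(check);
      HInstruction* get = new (&allocator) HInstanceFieldGet(
          check, Primitive::kPrimInt, MemberOffset(42));
      entry->AddInstruction(get);
      entry->AddInstruction(new (&allocator) HExit());

      PrepareForRegisterAllocation pass(graph);
      check->Accept(&pass);  // Equivalent to what Run() does per instruction.

      ASSERT_EQ(get->InputAt(0), parameter);  // Use rewired to the object.
      ASSERT_FALSE(check->HasUses());         // Only the side effect remains.
    }
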
diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h
new file mode 100644
index 0000000..37f2871
--- /dev/null
+++ b/compiler/optimizing/prepare_for_register_allocation.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_PREPARE_FOR_REGISTER_ALLOCATION_H_
+#define ART_COMPILER_OPTIMIZING_PREPARE_FOR_REGISTER_ALLOCATION_H_
+
+#include "nodes.h"
+
+namespace art {
+
+/**
+ * A simplification pass over the graph that runs before register allocation.
+ * For example, it redirects uses of null checks and bounds checks to the
+ * original objects, to avoid creating live ranges for these checks.
+ */
+class PrepareForRegisterAllocation : public HGraphVisitor {
+ public:
+  explicit PrepareForRegisterAllocation(HGraph* graph) : HGraphVisitor(graph) {}
+
+  void Run();
+
+ private:
+  virtual void VisitNullCheck(HNullCheck* check) OVERRIDE;
+  virtual void VisitBoundsCheck(HBoundsCheck* check) OVERRIDE;
+
+  DISALLOW_COPY_AND_ASSIGN(PrepareForRegisterAllocation);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_PREPARE_FOR_REGISTER_ALLOCATION_H_
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 1d1d694..3ee1afe 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -150,8 +150,7 @@
     if (temp.IsRegister()) {
       BlockRegister(temp, position, position + 1, Primitive::kPrimInt);
     } else {
-      LiveInterval* interval =
-          LiveInterval::MakeTempInterval(allocator_, instruction, Primitive::kPrimInt);
+      LiveInterval* interval = LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt);
       temp_intervals_.Add(interval);
       interval->AddRange(position, position + 1);
       unhandled_core_intervals_.Add(interval);
@@ -486,12 +485,18 @@
     reg = current->GetRegister();
     DCHECK_NE(free_until[reg], 0u);
   } else {
-    // Pick the register that is free the longest.
-    for (size_t i = 0; i < number_of_registers_; ++i) {
-      if (IsBlocked(i)) continue;
-      if (reg == -1 || free_until[i] > free_until[reg]) {
-        reg = i;
-        if (free_until[i] == kMaxLifetimePosition) break;
+    int hint = current->FindFirstRegisterHint(free_until);
+    if (hint != kNoRegister) {
+      DCHECK(!IsBlocked(hint));
+      reg = hint;
+    } else {
+      // Pick the register that is free the longest.
+      for (size_t i = 0; i < number_of_registers_; ++i) {
+        if (IsBlocked(i)) continue;
+        if (reg == -1 || free_until[i] > free_until[reg]) {
+          reg = i;
+          if (free_until[i] == kMaxLifetimePosition) break;
+        }
       }
     }
   }
@@ -654,10 +659,6 @@
   }
 }
 
-static bool NeedTwoSpillSlot(Primitive::Type type) {
-  return type == Primitive::kPrimLong || type == Primitive::kPrimDouble;
-}
-
 void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) {
   LiveInterval* parent = interval->GetParent();
 
@@ -698,7 +699,7 @@
     }
   }
 
-  if (NeedTwoSpillSlot(parent->GetType())) {
+  if (parent->NeedsTwoSpillSlots()) {
     if (slot == spill_slots_.Size()) {
       // We need a new spill slot.
       spill_slots_.Add(end);
@@ -722,24 +723,6 @@
   parent->SetSpillSlot((slot + reserved_out_slots_) * kVRegSize);
 }
 
-static Location ConvertToLocation(LiveInterval* interval) {
-  if (interval->HasRegister()) {
-    return Location::RegisterLocation(ManagedRegister(interval->GetRegister()));
-  } else {
-    HInstruction* defined_by = interval->GetParent()->GetDefinedBy();
-    if (defined_by->IsConstant()) {
-      return defined_by->GetLocations()->Out();
-    } else {
-      DCHECK(interval->GetParent()->HasSpillSlot());
-      if (NeedTwoSpillSlot(interval->GetType())) {
-        return Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot());
-      } else {
-        return Location::StackSlot(interval->GetParent()->GetSpillSlot());
-      }
-    }
-  }
-}
-
 // We create a special marker for inputs moves to differentiate them from
 // moves created during resolution. They must be different instructions
 // because the input moves work on the assumption that the interval moves
@@ -795,7 +778,7 @@
     move = at->GetNext()->AsParallelMove();
     // This is a parallel move for connecting siblings in a same block. We need to
     // differentiate it with moves for connecting blocks, and input moves.
-    if (move == nullptr || move->GetLifetimePosition() != position) {
+    if (move == nullptr || IsInputMove(move) || move->GetLifetimePosition() > position) {
       move = new (allocator_) HParallelMove(allocator_);
       move->SetLifetimePosition(position);
       at->GetBlock()->InsertInstructionBefore(move, at->GetNext());
@@ -803,16 +786,11 @@
   } else {
     // Move must happen before the instruction.
     HInstruction* previous = at->GetPrevious();
-    if (previous != nullptr && previous->IsParallelMove()) {
+    if (previous != nullptr && previous->IsParallelMove() && IsInputMove(previous)) {
       // This is a parallel move for connecting siblings in a same block. We need to
-      // differentiate it with moves for connecting blocks, and input moves.
-      if (previous->GetLifetimePosition() != position) {
-        // If the previous instruction of the previous instruction is not a parallel
-        // move, we have to insert the new parallel move before the input or connecting
-        // block moves.
-        at = previous;
-        previous = previous->GetPrevious();
-      }
+      // differentiate it with input moves.
+      at = previous;
+      previous = previous->GetPrevious();
     }
     if (previous == nullptr
         || !previous->IsParallelMove()
@@ -830,6 +808,7 @@
       move = previous->AsParallelMove();
     }
   }
+  DCHECK_EQ(move->GetLifetimePosition(), position);
   move->AddMove(new (allocator_) MoveOperands(source, destination, instruction));
 }
 
@@ -906,7 +885,7 @@
     // We spill eagerly, so move must be at definition.
     InsertMoveAfter(interval->GetDefinedBy(),
                     Location::RegisterLocation(ManagedRegister(interval->GetRegister())),
-                    NeedTwoSpillSlot(interval->GetType())
+                    interval->NeedsTwoSpillSlots()
                         ? Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot())
                         : Location::StackSlot(interval->GetParent()->GetSpillSlot()));
   }
@@ -915,7 +894,7 @@
   // Walk over all siblings, updating locations of use positions, and
   // connecting them when they are adjacent.
   do {
-    Location source = ConvertToLocation(current);
+    Location source = current->ToLocation();
 
     // Walk over all uses covered by this interval, and update the location
     // information.
@@ -940,7 +919,7 @@
     if (next_sibling != nullptr
         && next_sibling->HasRegister()
         && current->GetEnd() == next_sibling->GetStart()) {
-      Location destination = ConvertToLocation(next_sibling);
+      Location destination = next_sibling->ToLocation();
       InsertParallelMoveAt(current->GetEnd(), interval->GetDefinedBy(), source, destination);
     }
 
@@ -1030,27 +1009,17 @@
   if (from->GetSuccessors().Size() == 1) {
     InsertParallelMoveAtExitOf(from,
                                interval->GetParent()->GetDefinedBy(),
-                               ConvertToLocation(source),
-                               ConvertToLocation(destination));
+                               source->ToLocation(),
+                               destination->ToLocation());
   } else {
     DCHECK_EQ(to->GetPredecessors().Size(), 1u);
     InsertParallelMoveAtEntryOf(to,
                                 interval->GetParent()->GetDefinedBy(),
-                                ConvertToLocation(source),
-                                ConvertToLocation(destination));
+                                source->ToLocation(),
+                                destination->ToLocation());
   }
 }
 
-// Returns the location of `interval`, or siblings of `interval`, at `position`.
-static Location FindLocationAt(LiveInterval* interval, size_t position) {
-  LiveInterval* current = interval;
-  while (!current->Covers(position)) {
-    current = current->GetNextSibling();
-    DCHECK(current != nullptr);
-  }
-  return ConvertToLocation(current);
-}
-
 void RegisterAllocator::Resolve() {
   codegen_->ComputeFrameSize(
       spill_slots_.Size(), maximum_number_of_live_registers_, reserved_out_slots_);
@@ -1077,7 +1046,7 @@
       }
     }
 
-    Location source = ConvertToLocation(current);
+    Location source = current->ToLocation();
 
     if (location.IsUnallocated()) {
       if (location.GetPolicy() == Location::kSameAsFirstInput) {
@@ -1117,9 +1086,9 @@
         HBasicBlock* predecessor = current->GetPredecessors().Get(i);
         DCHECK_EQ(predecessor->GetSuccessors().Size(), 1u);
         HInstruction* input = phi->InputAt(i);
-        Location source = FindLocationAt(input->GetLiveInterval(),
-                                         predecessor->GetLastInstruction()->GetLifetimePosition());
-        Location destination = ConvertToLocation(phi->GetLiveInterval());
+        Location source = input->GetLiveInterval()->GetLocationAt(
+            predecessor->GetLifetimeEnd() - 1);
+        Location destination = phi->GetLiveInterval()->ToLocation();
         InsertParallelMoveAtExitOf(predecessor, nullptr, source, destination);
       }
     }
@@ -1130,11 +1099,12 @@
   size_t temp_index = 0;
   for (size_t i = 0; i < temp_intervals_.Size(); ++i) {
     LiveInterval* temp = temp_intervals_.Get(i);
-    if (temp->GetDefinedBy() != current) {
+    HInstruction* at = liveness_.GetTempUser(temp);
+    if (at != current) {
       temp_index = 0;
-      current = temp->GetDefinedBy();
+      current = at;
     }
-    LocationSummary* locations = current->GetLocations();
+    LocationSummary* locations = at->GetLocations();
     locations->SetTempAt(
         temp_index++, Location::RegisterLocation(ManagedRegister(temp->GetRegister())));
   }
diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc
index 535a768..b7d56e6 100644
--- a/compiler/optimizing/register_allocator_test.cc
+++ b/compiler/optimizing/register_allocator_test.cc
@@ -25,6 +25,7 @@
 #include "ssa_liveness_analysis.h"
 #include "ssa_phi_elimination.h"
 #include "utils/arena_allocator.h"
+#include "utils/managed_register.h"
 
 #include "gtest/gtest.h"
 
@@ -418,17 +419,17 @@
   // Add three temps holding the same register, and starting at different positions.
   // Put the one that should be picked in the middle of the inactive list to ensure
   // we do not depend on an order.
-  LiveInterval* interval = LiveInterval::MakeTempInterval(&allocator, nullptr, Primitive::kPrimInt);
+  LiveInterval* interval = LiveInterval::MakeTempInterval(&allocator, Primitive::kPrimInt);
   interval->SetRegister(0);
   interval->AddRange(40, 50);
   register_allocator.inactive_.Add(interval);
 
-  interval = LiveInterval::MakeTempInterval(&allocator, nullptr, Primitive::kPrimInt);
+  interval = LiveInterval::MakeTempInterval(&allocator, Primitive::kPrimInt);
   interval->SetRegister(0);
   interval->AddRange(20, 30);
   register_allocator.inactive_.Add(interval);
 
-  interval = LiveInterval::MakeTempInterval(&allocator, nullptr, Primitive::kPrimInt);
+  interval = LiveInterval::MakeTempInterval(&allocator, Primitive::kPrimInt);
   interval->SetRegister(0);
   interval->AddRange(60, 70);
   register_allocator.inactive_.Add(interval);
@@ -447,4 +448,250 @@
   ASSERT_EQ(20u, register_allocator.unhandled_->Get(0)->GetStart());
 }
 
+static HGraph* BuildIfElseWithPhi(ArenaAllocator* allocator,
+                                  HPhi** phi,
+                                  HInstruction** input1,
+                                  HInstruction** input2) {
+  HGraph* graph = new (allocator) HGraph(allocator);
+  HBasicBlock* entry = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(entry);
+  graph->SetEntryBlock(entry);
+  HInstruction* parameter = new (allocator) HParameterValue(0, Primitive::kPrimNot);
+  entry->AddInstruction(parameter);
+
+  HBasicBlock* block = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(block);
+  entry->AddSuccessor(block);
+
+  HInstruction* test = new (allocator) HInstanceFieldGet(
+      parameter, Primitive::kPrimBoolean, MemberOffset(22));
+  block->AddInstruction(test);
+  block->AddInstruction(new (allocator) HIf(test));
+  HBasicBlock* then = new (allocator) HBasicBlock(graph);
+  HBasicBlock* else_ = new (allocator) HBasicBlock(graph);
+  HBasicBlock* join = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(then);
+  graph->AddBlock(else_);
+  graph->AddBlock(join);
+
+  block->AddSuccessor(then);
+  block->AddSuccessor(else_);
+  then->AddSuccessor(join);
+  else_->AddSuccessor(join);
+  then->AddInstruction(new (allocator) HGoto());
+  else_->AddInstruction(new (allocator) HGoto());
+
+  *phi = new (allocator) HPhi(allocator, 0, 0, Primitive::kPrimInt);
+  join->AddPhi(*phi);
+  *input1 = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, MemberOffset(42));
+  *input2 = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, MemberOffset(42));
+  then->AddInstruction(*input1);
+  else_->AddInstruction(*input2);
+  join->AddInstruction(new (allocator) HExit());
+  (*phi)->AddInput(*input1);
+  (*phi)->AddInput(*input2);
+
+  graph->BuildDominatorTree();
+  graph->FindNaturalLoops();
+  return graph;
+}
+
+TEST(RegisterAllocatorTest, PhiHint) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HPhi *phi;
+  HInstruction *input1, *input2;
+
+  {
+    HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    // Check that the register allocator is deterministic.
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 0);
+    ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 0);
+    ASSERT_EQ(phi->GetLiveInterval()->GetRegister(), 0);
+  }
+
+  {
+    HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    // Set the phi to a specific register, and check that the inputs get allocated
+    // the same register.
+    phi->GetLocations()->SetOut(Location::RegisterLocation(ManagedRegister(2)));
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2);
+    ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2);
+    ASSERT_EQ(phi->GetLiveInterval()->GetRegister(), 2);
+  }
+
+  {
+    HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    // Set input1 to a specific register, and check that the phi and other input get allocated
+    // the same register.
+    input1->GetLocations()->SetOut(Location::RegisterLocation(ManagedRegister(2)));
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2);
+    ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2);
+    ASSERT_EQ(phi->GetLiveInterval()->GetRegister(), 2);
+  }
+
+  {
+    HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    // Set input2 to a specific register, and check that the phi and other input get allocated
+    // the same register.
+    input2->GetLocations()->SetOut(Location::RegisterLocation(ManagedRegister(2)));
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2);
+    ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2);
+    ASSERT_EQ(phi->GetLiveInterval()->GetRegister(), 2);
+  }
+}
+
+static HGraph* BuildFieldReturn(ArenaAllocator* allocator,
+                                HInstruction** field,
+                                HInstruction** ret) {
+  HGraph* graph = new (allocator) HGraph(allocator);
+  HBasicBlock* entry = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(entry);
+  graph->SetEntryBlock(entry);
+  HInstruction* parameter = new (allocator) HParameterValue(0, Primitive::kPrimNot);
+  entry->AddInstruction(parameter);
+
+  HBasicBlock* block = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(block);
+  entry->AddSuccessor(block);
+
+  *field = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, MemberOffset(42));
+  block->AddInstruction(*field);
+  *ret = new (allocator) HReturn(*field);
+  block->AddInstruction(*ret);
+
+  HBasicBlock* exit = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(exit);
+  block->AddSuccessor(exit);
+  exit->AddInstruction(new (allocator) HExit());
+  return graph;
+}
+
+TEST(RegisterAllocatorTest, ExpectedInRegisterHint) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HInstruction *field, *ret;
+
+  {
+    HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    // Sanity check that in normal conditions, the register should be hinted to 0 (EAX).
+    ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 0);
+  }
+
+  {
+    HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    // Check that the field gets put in the register expected by its use.
+    ret->GetLocations()->SetInAt(0, Location::RegisterLocation(ManagedRegister(2)));
+
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 2);
+  }
+}
+
+static HGraph* BuildTwoAdds(ArenaAllocator* allocator,
+                            HInstruction** first_add,
+                            HInstruction** second_add) {
+  HGraph* graph = new (allocator) HGraph(allocator);
+  HBasicBlock* entry = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(entry);
+  graph->SetEntryBlock(entry);
+  HInstruction* parameter = new (allocator) HParameterValue(0, Primitive::kPrimInt);
+  HInstruction* constant1 = new (allocator) HIntConstant(0);
+  HInstruction* constant2 = new (allocator) HIntConstant(0);
+  entry->AddInstruction(parameter);
+  entry->AddInstruction(constant1);
+  entry->AddInstruction(constant2);
+
+  HBasicBlock* block = new (allocator) HBasicBlock(graph);
+  graph->AddBlock(block);
+  entry->AddSuccessor(block);
+
+  *first_add = new (allocator) HAdd(Primitive::kPrimInt, parameter, constant1);
+  block->AddInstruction(*first_add);
+  *second_add = new (allocator) HAdd(Primitive::kPrimInt, *first_add, constant2);
+  block->AddInstruction(*second_add);
+
+  block->AddInstruction(new (allocator) HExit());
+  return graph;
+}
+
+TEST(RegisterAllocatorTest, SameAsFirstInputHint) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HInstruction *first_add, *second_add;
+
+  {
+    HGraph* graph = BuildTwoAdds(&allocator, &first_add, &second_add);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    // Sanity check that in normal conditions, the registers are the same.
+    ASSERT_EQ(first_add->GetLiveInterval()->GetRegister(), 1);
+    ASSERT_EQ(second_add->GetLiveInterval()->GetRegister(), 1);
+  }
+
+  {
+    HGraph* graph = BuildTwoAdds(&allocator, &first_add, &second_add);
+    x86::CodeGeneratorX86 codegen(graph);
+    SsaLivenessAnalysis liveness(*graph, &codegen);
+    liveness.Analyze();
+
+    // Check that both adds get the same register.
+    first_add->InputAt(0)->GetLocations()->SetOut(Location::RegisterLocation(ManagedRegister(2)));
+    ASSERT_EQ(first_add->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput);
+    ASSERT_EQ(second_add->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput);
+
+    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+    register_allocator.AllocateRegisters();
+
+    ASSERT_EQ(first_add->GetLiveInterval()->GetRegister(), 2);
+    ASSERT_EQ(second_add->GetLiveInterval()->GetRegister(), 2);
+  }
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index cd13d81..1de90b4 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -297,4 +297,136 @@
   return live_in->UnionIfNotIn(live_out, kill);
 }
 
+int LiveInterval::FindFirstRegisterHint(size_t* free_until) const {
+  if (GetParent() == this && defined_by_ != nullptr) {
+    // This is the first interval for the instruction. Try to find
+    // a register based on its definition.
+    DCHECK_EQ(defined_by_->GetLiveInterval(), this);
+    int hint = FindHintAtDefinition();
+    if (hint != kNoRegister && free_until[hint] > GetStart()) {
+      return hint;
+    }
+  }
+
+  UsePosition* use = first_use_;
+  size_t start = GetStart();
+  size_t end = GetEnd();
+  while (use != nullptr && use->GetPosition() <= end) {
+    size_t use_position = use->GetPosition();
+    if (use_position >= start && !use->GetIsEnvironment()) {
+      HInstruction* user = use->GetUser();
+      size_t input_index = use->GetInputIndex();
+      if (user->IsPhi()) {
+        // If the phi has a register, try to use the same.
+        Location phi_location = user->GetLiveInterval()->ToLocation();
+        if (phi_location.IsRegister() && free_until[phi_location.reg().RegId()] >= use_position) {
+          return phi_location.reg().RegId();
+        }
+        const GrowableArray<HBasicBlock*>& predecessors = user->GetBlock()->GetPredecessors();
+        // If the instruction dies at the phi assignment, we can try having the
+        // same register.
+        if (end == predecessors.Get(input_index)->GetLifetimeEnd()) {
+          for (size_t i = 0, e = user->InputCount(); i < e; ++i) {
+            if (i == input_index) {
+              continue;
+            }
+            HInstruction* input = user->InputAt(i);
+            Location location = input->GetLiveInterval()->GetLocationAt(
+                predecessors.Get(i)->GetLifetimeEnd() - 1);
+            if (location.IsRegister() && free_until[location.reg().RegId()] >= use_position) {
+              return location.reg().RegId();
+            }
+          }
+        }
+      } else {
+        // If the instruction is expected in a register, try to use it.
+        LocationSummary* locations = user->GetLocations();
+        Location expected = locations->InAt(use->GetInputIndex());
+        // We use the user's lifetime position - 1 (and not `use_position`) because the
+        // register is blocked at the beginning of the user.
+        size_t position = user->GetLifetimePosition() - 1;
+        if (expected.IsRegister() && free_until[expected.reg().RegId()] >= position) {
+          return expected.reg().RegId();
+        }
+      }
+    }
+    use = use->GetNext();
+  }
+
+  return kNoRegister;
+}
+
+int LiveInterval::FindHintAtDefinition() const {
+  if (defined_by_->IsPhi()) {
+    // Try to use the same register as one of the inputs.
+    const GrowableArray<HBasicBlock*>& predecessors = defined_by_->GetBlock()->GetPredecessors();
+    for (size_t i = 0, e = defined_by_->InputCount(); i < e; ++i) {
+      HInstruction* input = defined_by_->InputAt(i);
+      size_t end = predecessors.Get(i)->GetLifetimeEnd();
+      const LiveInterval& input_interval = input->GetLiveInterval()->GetIntervalAt(end - 1);
+      if (input_interval.GetEnd() == end) {
+        // If the input dies at the end of the predecessor, we know its register can
+        // be reused.
+        Location input_location = input_interval.ToLocation();
+        if (input_location.IsRegister()) {
+          return input_location.reg().RegId();
+        }
+      }
+    }
+  } else {
+    LocationSummary* locations = GetDefinedBy()->GetLocations();
+    Location out = locations->Out();
+    if (out.IsUnallocated() && out.GetPolicy() == Location::kSameAsFirstInput) {
+      // Try to use the same register as the first input.
+      const LiveInterval& input_interval =
+          GetDefinedBy()->InputAt(0)->GetLiveInterval()->GetIntervalAt(GetStart() - 1);
+      if (input_interval.GetEnd() == GetStart()) {
+        // If the input dies at the start of this instruction, we know its register can
+        // be reused.
+        Location location = input_interval.ToLocation();
+        if (location.IsRegister()) {
+          return location.reg().RegId();
+        }
+      }
+    }
+  }
+  return kNoRegister;
+}
+
+bool LiveInterval::NeedsTwoSpillSlots() const {
+  return type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble;
+}
+
+Location LiveInterval::ToLocation() const {
+  if (HasRegister()) {
+    return Location::RegisterLocation(ManagedRegister(GetRegister()));
+  } else {
+    HInstruction* defined_by = GetParent()->GetDefinedBy();
+    if (defined_by->IsConstant()) {
+      return defined_by->GetLocations()->Out();
+    } else if (GetParent()->HasSpillSlot()) {
+      if (NeedsTwoSpillSlots()) {
+        return Location::DoubleStackSlot(GetParent()->GetSpillSlot());
+      } else {
+        return Location::StackSlot(GetParent()->GetSpillSlot());
+      }
+    } else {
+      return Location();
+    }
+  }
+}
+
+Location LiveInterval::GetLocationAt(size_t position) const {
+  return GetIntervalAt(position).ToLocation();
+}
+
+const LiveInterval& LiveInterval::GetIntervalAt(size_t position) const {
+  const LiveInterval* current = this;
+  while (!current->Covers(position)) {
+    current = current->GetNextSibling();
+    DCHECK(current != nullptr);
+  }
+  return *current;
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index c62e61b..e9bd303 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -23,6 +23,8 @@
 
 class CodeGenerator;
 
+static constexpr int kNoRegister = -1;
+
 class BlockInfo : public ArenaObject {
  public:
   BlockInfo(ArenaAllocator* allocator, const HBasicBlock& block, size_t number_of_ssa_values)
@@ -166,10 +168,8 @@
     return new (allocator) LiveInterval(allocator, type, nullptr, true, reg, false);
   }
 
-  static LiveInterval* MakeTempInterval(ArenaAllocator* allocator,
-                                        HInstruction* defined_by,
-                                        Primitive::Type type) {
-    return new (allocator) LiveInterval(allocator, type, defined_by, false, kNoRegister, true);
+  static LiveInterval* MakeTempInterval(ArenaAllocator* allocator, Primitive::Type type) {
+    return new (allocator) LiveInterval(allocator, type, nullptr, false, kNoRegister, true);
   }
 
   bool IsFixed() const { return is_fixed_; }
@@ -484,6 +484,31 @@
 
   LiveInterval* GetNextSibling() const { return next_sibling_; }
 
+  // Returns the first register hint that is free at least until the position
+  // where it is needed, according to the per-register `free_until` array.
+  // If none is found, returns `kNoRegister`.
+  int FindFirstRegisterHint(size_t* free_until) const;
+
+  // If there is enough information at the definition site to pick a register
+  // (for example the output must use the same register as the first input),
+  // returns that register as a hint. Returns kNoRegister otherwise.
+  int FindHintAtDefinition() const;
+
+  // Returns whether the interval needs two stack slots (each of Dex virtual
+  // register size `kVRegSize`) for spilling.
+  bool NeedsTwoSpillSlots() const;
+
+  // Converts the location of the interval to a `Location` object.
+  Location ToLocation() const;
+
+  // Returns the location of the sibling interval that covers `position`.
+  Location GetLocationAt(size_t position) const;
+
+  // Finds the interval that covers `position`.
+  const LiveInterval& GetIntervalAt(size_t position) const;
+
+  bool IsTemp() const { return is_temp_; }
+
  private:
   ArenaAllocator* const allocator_;
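`NeedsTwoSpillSlots` exists because longs and doubles span two Dex virtual registers. A hedged sketch of how a spill-slot assigner might use it; `next_slot` is hypothetical, and `SetSpillSlot`/`kVRegSize` are assumed to be the existing ART names:

  // Assign kVRegSize-granular spill slots; wide values take two consecutive slots.
  size_t required = interval->NeedsTwoSpillSlots() ? 2u : 1u;
  interval->GetParent()->SetSpillSlot(next_slot * kVRegSize);
  next_slot += required;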
 
@@ -567,6 +592,12 @@
     return instructions_from_lifetime_position_.Get(index);
   }
 
+  HInstruction* GetTempUser(LiveInterval* temp) const {
+    // A temporary shares the same lifetime start as the instruction that requires it.
+    DCHECK(temp->IsTemp());
+    return GetInstructionFromPosition(temp->GetStart() / 2);
+  }
+
   size_t GetMaxLifetimePosition() const {
     return instructions_from_lifetime_position_.Size() * 2 - 1;
   }
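The `/ 2` in `GetTempUser` relies on the liveness numbering: instructions are laid out two lifetime positions apart (hence `Size() * 2` above), so the i-th instruction of a method sits at position `2 * i`. A small worked example with an invented position:

  // The instruction at index 3 sits at lifetime position 6. A temp it requires
  // gets the same start position, so:
  //   temp->GetStart() == 6
  //   GetTempUser(temp) == GetInstructionFromPosition(6 / 2)  // Index 3.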
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index f888d46..3ff24b7 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -243,6 +243,17 @@
 }
 
 
+void X86Assembler::movw(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOperandSizeOverride();
+  EmitUint8(0xC7);
+  EmitOperand(0, dst);
+  CHECK(imm.is_uint16() || imm.is_int16());
+  EmitUint8(imm.value() & 0xFF);
+  EmitUint8(imm.value() >> 8);
+}
+
+
 void X86Assembler::leal(Register dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x8D);
@@ -266,6 +277,14 @@
 }
 
 
+void X86Assembler::movaps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x28);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
 void X86Assembler::movss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
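As a sanity check on the two new x86 emitters, these are the byte sequences they produce for representative operands, worked out by hand from the Intel encoding rather than taken from assembler output:

  // movw(Address(EAX, 4), Immediate(0x1234)):
  //   66        operand-size override: C7 becomes a 16-bit store
  //   C7        MOV r/m16, imm16 (opcode extension /0)
  //   40 04     ModRM mod=01 reg=000(/0) rm=000(EAX), disp8 = 4
  //   34 12     0x1234, low byte first
  // movaps(XMM1, XMM2):
  //   0F 28     MOVAPS xmm, xmm/m128 (load form)
  //   CA        ModRM mod=11 reg=001(XMM1, dst) rm=010(XMM2, src)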
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index ec983d9..1f6f7e6 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -37,6 +37,7 @@
 
   bool is_int8() const { return IsInt(8, value_); }
   bool is_uint8() const { return IsUint(8, value_); }
+  bool is_int16() const { return IsInt(16, value_); }
   bool is_uint16() const { return IsUint(16, value_); }
 
  private:
@@ -251,6 +252,7 @@
   void movsxw(Register dst, const Address& src);
   void movw(Register dst, const Address& src);
   void movw(const Address& dst, Register src);
+  void movw(const Address& dst, const Immediate& imm);
 
   void leal(Register dst, const Address& src);
 
@@ -258,6 +260,7 @@
 
   void setb(Condition condition, Register dst);
 
+  void movaps(XmmRegister dst, XmmRegister src);
   void movss(XmmRegister dst, const Address& src);
   void movss(const Address& dst, XmmRegister src);
   void movss(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index f432e66..705b639 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -234,6 +234,7 @@
 
 void X86_64Assembler::movb(const Address& dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitUint8(0xC6);
   EmitOperand(Register::RAX, dst);
   CHECK(imm.is_int8());
@@ -291,6 +292,18 @@
 }
 
 
+void X86_64Assembler::movw(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOperandSizeOverride();
+  EmitOptionalRex32(dst);
+  EmitUint8(0xC7);
+  EmitOperand(Register::RAX, dst);
+  CHECK(imm.is_uint16() || imm.is_int16());
+  EmitUint8(imm.value() & 0xFF);
+  EmitUint8(imm.value() >> 8);
+}
+
+
 void X86_64Assembler::leaq(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitRex64(dst, src);
@@ -299,6 +312,15 @@
 }
 
 
+void X86_64Assembler::movaps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0x28);
+  EmitXmmRegisterOperand(dst.LowBits(), src);  // 0F 28 load form: dst in reg field.
+}
+
+
 void X86_64Assembler::movss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
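The x86-64 variants differ only in the optional REX prefix, which is also what the `movb` fix above addresses: without the added `EmitOptionalRex32(dst)`, a byte store through R8-R15 would encode the wrong base register. Hand-worked encodings for illustration:

  // movw(Address(CpuRegister(R8), 0), Immediate(0x1234)):
  //   66        operand-size override
  //   41        REX.B, extending the base to R8
  //   C7        MOV r/m16, imm16 (/0)
  //   00        ModRM mod=00 reg=000(/0) rm=000 (R8 low bits)
  //   34 12     imm16, little-endian
  // movaps(XMM8, XMM1):
  //   44        REX.R, extending the destination to XMM8
  //   0F 28 C1  ModRM mod=11 reg=000(XMM8 low bits) rm=001(XMM1)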
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 1fd65c2..268f72b 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -44,6 +44,7 @@
 
   bool is_int8() const { return IsInt(8, value_); }
   bool is_uint8() const { return IsUint(8, value_); }
+  bool is_int16() const { return IsInt(16, value_); }
   bool is_uint16() const { return IsUint(16, value_); }
   bool is_int32() const {
     // This does not work on 32b machines: return IsInt(32, value_);
@@ -295,9 +296,12 @@
   void movsxw(CpuRegister dst, const Address& src);
   void movw(CpuRegister dst, const Address& src);
   void movw(const Address& dst, CpuRegister src);
+  void movw(const Address& dst, const Immediate& imm);
 
   void leaq(CpuRegister dst, const Address& src);
 
+  void movaps(XmmRegister dst, XmmRegister src);
+
   void movss(XmmRegister dst, const Address& src);
   void movss(const Address& dst, XmmRegister src);
   void movss(XmmRegister dst, XmmRegister src);
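Putting the new declarations to use: a hedged sketch of driving both entry points, shown against the 32-bit assembler where the register names are plain enums (operand values are arbitrary, not taken from the patch):

  X86Assembler assembler;
  // 16-bit immediate store, e.g. writing a char field at [EAX + 4].
  assembler.movw(Address(EAX, 4), Immediate(0x1234));
  // Full-register XMM copy, usable for both float and double moves.
  assembler.movaps(XMM1, XMM2);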
diff --git a/test/407-arrays/src/Main.java b/test/407-arrays/src/Main.java
index b5e95b0..d5c5604 100644
--- a/test/407-arrays/src/Main.java
+++ b/test/407-arrays/src/Main.java
@@ -70,6 +70,15 @@
     chars[index] = 'd';
     assertEquals('d', chars[index]);
 
+    chars[0] = 65535;
+    assertEquals(65535, chars[0]);
+    // Do another store in between the two max-value stores, so the compiler
+    // cannot optimize the second one away.
+    chars[index] = 0;
+    assertEquals(0, chars[index]);
+    chars[index] = 65535;
+    assertEquals(65535, chars[index]);
+
     shorts[0] = -42;
     assertEquals(-42, shorts[0]);
     shorts[index] = -84;
@@ -86,7 +95,13 @@
     Object o2 = new Object();
     objects[index] = o2;
     assertEquals(o2, objects[index]);
+    // Longs are initially not supported by the linear scan register allocator
+    // on 32-bit. So we do the long writes in a separate method to ensure this
+    // method still gets optimized.
+    $opt$testLongWrites(longs, index);
+  }
 
+  public static void $opt$testLongWrites(long[] longs, int index) {
     long l = -21876876876876876L;
     longs[0] = l;
     assertEquals(l, longs[0]);
diff --git a/test/410-floats/expected.txt b/test/410-floats/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/410-floats/expected.txt
diff --git a/test/410-floats/info.txt b/test/410-floats/info.txt
new file mode 100644
index 0000000..5332704
--- /dev/null
+++ b/test/410-floats/info.txt
@@ -0,0 +1 @@
+Small tests involving floats and doubles.
diff --git a/test/410-floats/src/Main.java b/test/410-floats/src/Main.java
new file mode 100644
index 0000000..d8d6fac
--- /dev/null
+++ b/test/410-floats/src/Main.java
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    assertEquals(4.2f, returnFloat());
+    float[] a = new float[1];
+    a[0] = 42.2f;
+    assertEquals(42.2f, returnFloat(a));
+
+    assertEquals(4.4, returnDouble());
+    double[] b = new double[1];
+    b[0] = 42.4;
+    assertEquals(42.4, returnDouble(b));
+
+    assertEquals(4.2f, invokeReturnFloat());
+    assertEquals(4.4, invokeReturnDouble());
+    assertEquals(4.2f, takeAFloat(4.2f));
+    assertEquals(3.1, takeADouble(3.1));
+    assertEquals(12.7, takeThreeDouble(3.1, 4.4, 5.2));
+    assertEquals(12.7f, takeThreeFloat(3.1f, 4.4f, 5.2f));
+    assertEquals(4.2f, invokeTakeAFloat(4.2f));
+    assertEquals(3.1, invokeTakeADouble(3.1));
+    assertEquals(12.7, invokeTakeThreeDouble(3.1, 4.4, 5.2));
+    assertEquals(12.7f, invokeTakeThreeFloat(3.1f, 4.4f, 5.2f));
+  }
+
+  public static float invokeReturnFloat() {
+    return returnFloat();
+  }
+
+  public static double invokeReturnDouble() {
+    return returnDouble();
+  }
+
+  public static float returnFloat() {
+    return 4.2f;
+  }
+
+  public static float returnFloat(float[] a) {
+    return a[0];
+  }
+
+  public static double returnDouble() {
+    return 4.4;
+  }
+
+  public static double returnDouble(double[] a) {
+    return a[0];
+  }
+
+  public static float takeAFloat(float a) {
+    return a;
+  }
+
+  public static double takeADouble(double a) {
+    return a;
+  }
+
+  public static double takeThreeDouble(double a, double b, double c) {
+    return a + b + c;
+  }
+
+  public static float takeThreeFloat(float a, float b, float c) {
+    return a + b + c;
+  }
+
+  public static float invokeTakeAFloat(float a) {
+    return takeAFloat(a);
+  }
+
+  public static double invokeTakeADouble(double a) {
+    return takeADouble(a);
+  }
+
+  public static double invokeTakeThreeDouble(double a, double b, double c) {
+    return takeThreeDouble(a, b, c);
+  }
+
+  public static float invokeTakeThreeFloat(float a, float b, float c) {
+    return takeThreeFloat(a, b, c);
+  }
+
+  public static void assertEquals(float expected, float actual) {
+    if (expected != actual) {
+      throw new AssertionError("Expected " + expected + " got " + actual);
+    }
+  }
+
+  public static void assertEquals(double expected, double actual) {
+    if (expected != actual) {
+      throw new AssertionError("Expected " + expected + " got " + actual);
+    }
+  }
+}