Merge "AArch64: Fix quick compiler monitor implementation."
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index 144594e..3bc060b 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -147,8 +147,8 @@
     // Instruction::MOVE_RESULT,
     // Instruction::MOVE_RESULT_WIDE,
     // Instruction::MOVE_RESULT_OBJECT,
-    // Instruction::MOVE_EXCEPTION,
-    // Instruction::RETURN_VOID,
+    Instruction::MOVE_EXCEPTION,
+    Instruction::RETURN_VOID,
     // Instruction::RETURN,
     // Instruction::RETURN_WIDE,
     // Instruction::RETURN_OBJECT,
@@ -163,8 +163,8 @@
     // Instruction::CONST_STRING,
     // Instruction::CONST_STRING_JUMBO,
     // Instruction::CONST_CLASS,
-    // Instruction::MONITOR_ENTER,
-    // Instruction::MONITOR_EXIT,
+    Instruction::MONITOR_ENTER,
+    Instruction::MONITOR_EXIT,
     // Instruction::CHECK_CAST,
     // Instruction::INSTANCE_OF,
     // Instruction::ARRAY_LENGTH,
@@ -173,7 +173,7 @@
     // Instruction::FILLED_NEW_ARRAY,
     // Instruction::FILLED_NEW_ARRAY_RANGE,
     // Instruction::FILL_ARRAY_DATA,
-    // Instruction::THROW,
+    Instruction::THROW,
     // Instruction::GOTO,
     // Instruction::GOTO_16,
     // Instruction::GOTO_32,
@@ -230,14 +230,14 @@
     // Instruction::IPUT_BYTE,
     // Instruction::IPUT_CHAR,
     // Instruction::IPUT_SHORT,
-    // Instruction::SGET,
+    Instruction::SGET,
     // Instruction::SGET_WIDE,
-    // Instruction::SGET_OBJECT,
+    Instruction::SGET_OBJECT,
     // Instruction::SGET_BOOLEAN,
     // Instruction::SGET_BYTE,
     // Instruction::SGET_CHAR,
     // Instruction::SGET_SHORT,
-    // Instruction::SPUT,
+    Instruction::SPUT,
     // Instruction::SPUT_WIDE,
     // Instruction::SPUT_OBJECT,
     // Instruction::SPUT_BOOLEAN,
@@ -350,7 +350,7 @@
     // Instruction::AND_INT_LIT16,
     // Instruction::OR_INT_LIT16,
     // Instruction::XOR_INT_LIT16,
-    // Instruction::ADD_INT_LIT8,
+    Instruction::ADD_INT_LIT8,
     // Instruction::RSUB_INT_LIT8,
     // Instruction::MUL_INT_LIT8,
     // Instruction::DIV_INT_LIT8,
@@ -403,7 +403,7 @@
     // kMirOpNullCheck,
     // kMirOpRangeCheck,
     // kMirOpDivZeroCheck,
-    // kMirOpCheck,
+    kMirOpCheck,
     // kMirOpCheckPart2,
     // kMirOpSelect,
     // kMirOpLast,
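
These hunks flip MOVE_EXCEPTION, RETURN_VOID, MONITOR_ENTER/EXIT, THROW, the plain SGET/SGET_OBJECT/SPUT variants, ADD_INT_LIT8 and kMirOpCheck from commented-out to live entries in the arm64 support list. As a minimal sketch of how such a whitelist gates the backend (the helper name and containers are illustrative; the real check lives in frontend.cc's compilation gate):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Reject a method as soon as it uses one opcode the backend cannot handle.
    static bool AllOpcodesSupported(const std::vector<uint16_t>& support_list,
                                    const std::vector<uint16_t>& method_opcodes) {
      return std::all_of(method_opcodes.begin(), method_opcodes.end(),
                         [&](uint16_t op) {
                           return std::find(support_list.begin(), support_list.end(),
                                            op) != support_list.end();
                         });
    }
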
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index 7ae4b02..f98e366 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -151,6 +151,9 @@
   rxzr = rx31,
   rwsp = rw31,
   rsp = rx31,
+  // TODO: rx4 is an argument register in the C ABI, which is not a good choice,
+  // but we still need to decide between a caller-save and a callee-save register
+  // in the C ABI, because the choice leads to different trampoline implementations.
   rA64_SUSPEND = rx4,
   rA64_SELF = rx18,
   rA64_SP = rx31,
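
For the TODO above, the relevant AAPCS64 background (general ABI facts, not ART-specific):

    // x0-x7   argument/result registers, caller-save (rx4 is one of these)
    // x9-x15  scratch registers, caller-save
    // x19-x28 callee-save: any C callee must preserve them
    // A caller-save suspend register has to be spilled and reloaded around every
    // transition into C code; a callee-save one survives such calls for free,
    // at the cost of one fewer register available for promotion.
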
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 8accd0a..93caf89 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -176,7 +176,7 @@
                  kFmtRegROrSp, 9, 5, kFmtBitBlt, 21, 10, kFmtBitBlt, 23, 22,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE0 | SETS_CCODES,
                  "cmn", "!0R, #!1d!2T", kFixupNone),
-    ENCODING_MAP(WIDE(kA64Cmp3Rro), SF_VARIANTS(0x6b20001f),
+    ENCODING_MAP(WIDE(kA64Cmp3Rro), SF_VARIANTS(0x6b00001f),
                  kFmtRegROrSp, 9, 5, kFmtRegR, 20, 16, kFmtShift, -1, -1,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE01 | SETS_CCODES,
                  "cmp", "!0R, !1r!2o", kFixupNone),
@@ -637,7 +637,7 @@
               }
 
               // Now check that the requirements are satisfied.
-              RegStorage reg(operand);
+              RegStorage reg(operand | RegStorage::kValid);
               const char *expected = nullptr;
               if (want_float) {
                 if (!reg.IsFloat()) {
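
The RegStorage fix just above matters because the raw encoder operand only carries the register number and shape bits; without the valid bit, RegStorage reports no register at all. A sketch of the difference (RegStorage::kValid and Valid() are the real API; the comments are illustrative):

    RegStorage raw(operand);                         // !raw.Valid(): treated as "no register"
    RegStorage reg(operand | RegStorage::kValid);    // reg.Valid(), so the IsFloat() checks
                                                     // below inspect the actual operand
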
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 1bcf19b..136a04f 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -194,137 +194,101 @@
  * details see monitor.cc.
  */
 void Arm64Mir2Lir::GenMonitorEnter(int opt_flags, RegLocation rl_src) {
+  // x0/w0 = object
+  // w1    = thin lock thread id
+  // x2    = address of lock word
+  // w3    = lock word / store failure
+  // TUNING: How much performance do we gain by inlining this,
+  // given that we have already flushed all registers?
   FlushAllRegs();
-  // FIXME: need separate LoadValues for object references.
-  LoadValueDirectFixed(rl_src, rs_x0);  // Get obj
+  LoadValueDirectFixed(rl_src, rs_w0);
   LockCallTemps();  // Prepare for explicit register usage
-  constexpr bool kArchVariantHasGoodBranchPredictor = false;  // TODO: true if cortex-A15.
-  if (kArchVariantHasGoodBranchPredictor) {
-    LIR* null_check_branch = nullptr;
-    if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
-      null_check_branch = nullptr;  // No null check.
-    } else {
-      // If the null-check fails its handled by the slow-path to reduce exception related meta-data.
-      if (Runtime::Current()->ExplicitNullChecks()) {
-        null_check_branch = OpCmpImmBranch(kCondEq, rs_x0, 0, NULL);
-      }
-    }
-    Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
-    NewLIR3(kA64Ldxr2rX, rx1, rx0, mirror::Object::MonitorOffset().Int32Value() >> 2);
-    MarkPossibleNullPointerException(opt_flags);
-    LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_x1, 0, NULL);
-    NewLIR4(kA64Stxr3wrX, rx1, rx2, rx0, mirror::Object::MonitorOffset().Int32Value() >> 2);
-    LIR* lock_success_branch = OpCmpImmBranch(kCondEq, rs_x1, 0, NULL);
-
-
-    LIR* slow_path_target = NewLIR0(kPseudoTargetLabel);
-    not_unlocked_branch->target = slow_path_target;
-    if (null_check_branch != nullptr) {
-      null_check_branch->target = slow_path_target;
-    }
-    // TODO: move to a slow path.
-    // Go expensive route - artLockObjectFromCode(obj);
-    LoadWordDisp(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pLockObject).Int32Value(), rs_rA64_LR);
-    ClobberCallerSave();
-    LIR* call_inst = OpReg(kOpBlx, rs_rA64_LR);
-    MarkSafepointPC(call_inst);
-
-    LIR* success_target = NewLIR0(kPseudoTargetLabel);
-    lock_success_branch->target = success_target;
-    GenMemBarrier(kLoadLoad);
+  LIR* null_check_branch = nullptr;
+  if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
+    null_check_branch = nullptr;  // No null check.
   } else {
-    // Explicit null-check as slow-path is entered using an IT.
-    GenNullCheck(rs_x0, opt_flags);
-    Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
-    MarkPossibleNullPointerException(opt_flags);
-    NewLIR3(kA64Ldxr2rX, rx1, rx0, mirror::Object::MonitorOffset().Int32Value() >> 2);
-    OpRegImm(kOpCmp, rs_x1, 0);
-    OpIT(kCondEq, "");
-    NewLIR4(kA64Stxr3wrX/*eq*/, rx1, rx2, rx0, mirror::Object::MonitorOffset().Int32Value() >> 2);
-    OpRegImm(kOpCmp, rs_x1, 0);
-    OpIT(kCondNe, "T");
-    // Go expensive route - artLockObjectFromCode(self, obj);
-    LoadWordDisp/*ne*/(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pLockObject).Int32Value(),
-                       rs_rA64_LR);
-    ClobberCallerSave();
-    LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rA64_LR);
-    MarkSafepointPC(call_inst);
-    GenMemBarrier(kLoadLoad);
+    // If the null-check fails, it's handled by the slow path to reduce exception-related meta-data.
+    if (Runtime::Current()->ExplicitNullChecks()) {
+      null_check_branch = OpCmpImmBranch(kCondEq, rs_x0, 0, NULL);
+    }
   }
+  Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_w1);
+  OpRegRegImm(kOpAdd, rs_x2, rs_x0, mirror::Object::MonitorOffset().Int32Value());
+  NewLIR2(kA64Ldxr2rX, rw3, rx2);
+  MarkPossibleNullPointerException(opt_flags);
+  LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_w3, 0, NULL);
+  NewLIR3(kA64Stxr3wrX, rw3, rw1, rx2);
+  LIR* lock_success_branch = OpCmpImmBranch(kCondEq, rs_w3, 0, NULL);
+
+  LIR* slow_path_target = NewLIR0(kPseudoTargetLabel);
+  not_unlocked_branch->target = slow_path_target;
+  if (null_check_branch != nullptr) {
+    null_check_branch->target = slow_path_target;
+  }
+  // TODO: move to a slow path.
+  // Go expensive route - artLockObjectFromCode(obj);
+  LoadWordDisp(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pLockObject).Int32Value(), rs_rA64_LR);
+  ClobberCallerSave();
+  LIR* call_inst = OpReg(kOpBlx, rs_rA64_LR);
+  MarkSafepointPC(call_inst);
+
+  LIR* success_target = NewLIR0(kPseudoTargetLabel);
+  lock_success_branch->target = success_target;
+  GenMemBarrier(kLoadLoad);
 }
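
With the register plan from the comment block, the enter fast path above assembles to roughly the following (a sketch only; offsets and labels are symbolic):

    // ldr   w1, [xSELF, #thin_lock_id_offset]  // w1 = thin lock thread id
    // add   x2, x0, #monitor_offset            // x2 = address of lock word
    // ldxr  w3, [x2]                           // load-exclusive the lock word
    // cmp   w3, #0
    // b.ne  slow_path                          // not unlocked -> runtime
    // stxr  w3, w1, [x2]                       // try to install our thread id
    // cmp   w3, #0
    // b.eq  success                            // stxr status 0: lock acquired
    // slow_path:                               // also the null-check target
    //   ldr   xLR, [xSELF, #pLockObject]
    //   blr   xLR                              // artLockObjectFromCode(obj)
    // success:
    //   <GenMemBarrier(kLoadLoad)>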
 
 /*
  * Handle thin locked -> unlocked transition inline or else call out to quick entrypoint. For more
- * details see monitor.cc. Note the code below doesn't use ldrex/strex as the code holds the lock
+ * details see monitor.cc. Note the code below doesn't use ldxr/stxr as the code holds the lock
  * and can only give away ownership if its suspended.
  */
 void Arm64Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) {
+  // x0/w0 = object
+  // w1    = thin lock thread id
+  // w2    = lock word
+  // TUNING: How much performance do we gain by inlining this,
+  // given that we have already flushed all registers?
   FlushAllRegs();
-  LoadValueDirectFixed(rl_src, rs_x0);  // Get obj
+  LoadValueDirectFixed(rl_src, rs_w0);  // Get obj
   LockCallTemps();  // Prepare for explicit register usage
   LIR* null_check_branch = nullptr;
-  Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
-  constexpr bool kArchVariantHasGoodBranchPredictor = false;  // TODO: true if cortex-A15.
-  if (kArchVariantHasGoodBranchPredictor) {
-    if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
-      null_check_branch = nullptr;  // No null check.
-    } else {
-      // If the null-check fails its handled by the slow-path to reduce exception related meta-data.
-      if (Runtime::Current()->ExplicitNullChecks()) {
-        null_check_branch = OpCmpImmBranch(kCondEq, rs_x0, 0, NULL);
-      }
-    }
-    Load32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_x1);
-    MarkPossibleNullPointerException(opt_flags);
-    LoadConstantNoClobber(rs_x3, 0);
-    LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_x1, rs_x2, NULL);
-    GenMemBarrier(kStoreLoad);
-    Store32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_x3);
-    LIR* unlock_success_branch = OpUnconditionalBranch(NULL);
-
-    LIR* slow_path_target = NewLIR0(kPseudoTargetLabel);
-    slow_unlock_branch->target = slow_path_target;
-    if (null_check_branch != nullptr) {
-      null_check_branch->target = slow_path_target;
-    }
-    // TODO: move to a slow path.
-    // Go expensive route - artUnlockObjectFromCode(obj);
-    LoadWordDisp(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pUnlockObject).Int32Value(), rs_rA64_LR);
-    ClobberCallerSave();
-    LIR* call_inst = OpReg(kOpBlx, rs_rA64_LR);
-    MarkSafepointPC(call_inst);
-
-    LIR* success_target = NewLIR0(kPseudoTargetLabel);
-    unlock_success_branch->target = success_target;
+  if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
+    null_check_branch = nullptr;  // No null check.
   } else {
-    // Explicit null-check as slow-path is entered using an IT.
-    GenNullCheck(rs_x0, opt_flags);
-    Load32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_x1);  // Get lock
-    MarkPossibleNullPointerException(opt_flags);
-    Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_x2);
-    LoadConstantNoClobber(rs_x3, 0);
-    // Is lock unheld on lock or held by us (==thread_id) on unlock?
-    OpRegReg(kOpCmp, rs_x1, rs_x2);
-    OpIT(kCondEq, "EE");
-    Store32Disp/*eq*/(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_x3);
-    // Go expensive route - UnlockObjectFromCode(obj);
-    LoadWordDisp/*ne*/(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pUnlockObject).Int32Value(),
-                       rs_rA64_LR);
-    ClobberCallerSave();
-    LIR* call_inst = OpReg(kOpBlx/*ne*/, rs_rA64_LR);
-    MarkSafepointPC(call_inst);
-    GenMemBarrier(kStoreLoad);
+    // If the null-check fails, it's handled by the slow path to reduce exception-related meta-data.
+    if (Runtime::Current()->ExplicitNullChecks()) {
+      null_check_branch = OpCmpImmBranch(kCondEq, rs_x0, 0, NULL);
+    }
   }
+  Load32Disp(rs_rA64_SELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_w1);
+  Load32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_w2);
+  MarkPossibleNullPointerException(opt_flags);
+  LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_w1, rs_w2, NULL);
+  GenMemBarrier(kStoreLoad);
+  Store32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_xzr);
+  LIR* unlock_success_branch = OpUnconditionalBranch(NULL);
+
+  LIR* slow_path_target = NewLIR0(kPseudoTargetLabel);
+  slow_unlock_branch->target = slow_path_target;
+  if (null_check_branch != nullptr) {
+    null_check_branch->target = slow_path_target;
+  }
+  // TODO: move to a slow path.
+  // Go expensive route - artUnlockObjectFromCode(obj);
+  LoadWordDisp(rs_rA64_SELF, QUICK_ENTRYPOINT_OFFSET(8, pUnlockObject).Int32Value(), rs_rA64_LR);
+  ClobberCallerSave();
+  LIR* call_inst = OpReg(kOpBlx, rs_rA64_LR);
+  MarkSafepointPC(call_inst);
+
+  LIR* success_target = NewLIR0(kPseudoTargetLabel);
+  unlock_success_branch->target = success_target;
 }
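
The matching exit fast path needs only a plain load/store pair, since the holder can give away ownership only while suspended (same sketch conventions as above):

    // ldr   w1, [xSELF, #thin_lock_id_offset]  // w1 = our thread id
    // ldr   w2, [x0, #monitor_offset]          // w2 = lock word
    // cmp   w1, w2
    // b.ne  slow_path                          // inflated, or thin lock not ours
    // <GenMemBarrier(kStoreLoad)>
    // str   wzr, [x0, #monitor_offset]         // release the thin lock
    // b     done
    // slow_path:                               // also the null-check target
    //   ldr   xLR, [xSELF, #pUnlockObject]
    //   blr   xLR                              // artUnlockObjectFromCode(obj)
    // done: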
 
 void Arm64Mir2Lir::GenMoveException(RegLocation rl_dest) {
   int ex_offset = Thread::ExceptionOffset<8>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  RegStorage reset_reg = AllocTemp();
   Load32Disp(rs_rA64_SELF, ex_offset, rl_result.reg);
-  LoadConstant(reset_reg, 0);
-  Store32Disp(rs_rA64_SELF, ex_offset, reset_reg);
-  FreeTemp(reset_reg);
+  Store32Disp(rs_rA64_SELF, ex_offset, rs_xzr);
   StoreValue(rl_dest, rl_result);
 }
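
GenMoveException now uses the same zero-register idiom as the unlock path: xzr/wzr reads as zero and discards writes, so clearing a word takes one store and no scratch register (general A64 behaviour):

    // Before: tmp = AllocTemp(); LoadConstant(tmp, 0); Store32Disp(..., tmp); FreeTemp(tmp);
    // After:  str wzr, [xSELF, #exception_offset]   // single store, no temp needed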
 
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 6caacc8..10be0d6 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -38,16 +38,27 @@
      rs_f24, rs_f25, rs_f26, rs_f27, rs_f28, rs_f29, rs_f30, rs_f31};
 static const RegStorage dp_regs_arr[] =
     {rs_d0, rs_d1, rs_d2, rs_d3, rs_d4, rs_d5, rs_d6, rs_d7,
-     rs_d8, rs_d9, rs_d10, rs_d11, rs_d12, rs_d13, rs_d14, rs_d15};
+     rs_d8, rs_d9, rs_d10, rs_d11, rs_d12, rs_d13, rs_d14, rs_d15,
+     rs_d16, rs_d17, rs_d18, rs_d19, rs_d20, rs_d21, rs_d22, rs_d23,
+     rs_d24, rs_d25, rs_d26, rs_d27, rs_d28, rs_d29, rs_d30, rs_d31};
 static const RegStorage reserved_regs_arr[] =
     {rs_rA64_SUSPEND, rs_rA64_SELF, rs_rA64_SP, rs_rA64_LR};
+// TUNING: Are there too many temp registers and too few promotion targets?
+// This definition needs to match runtime.cc, the quick entry assembly and the JNI compiler.
+// Note: we cannot call a C function directly if the definition does not match the C ABI.
+// Currently, rs_rA64_SELF is not a callee-save register, which does not match the C ABI.
 static const RegStorage core_temps_arr[] =
-    {rs_x0, rs_x1, rs_x2, rs_x3, rs_x12};
+    {rs_x0, rs_x1, rs_x2, rs_x3, rs_x4, rs_x5, rs_x6, rs_x7,
+     rs_x8, rs_x9, rs_x10, rs_x11, rs_x12, rs_x13, rs_x14, rs_x15, rs_x16,
+     rs_x17};
 static const RegStorage sp_temps_arr[] =
     {rs_f0, rs_f1, rs_f2, rs_f3, rs_f4, rs_f5, rs_f6, rs_f7,
-     rs_f8, rs_f9, rs_f10, rs_f11, rs_f12, rs_f13, rs_f14, rs_f15};
+     rs_f16, rs_f17, rs_f18, rs_f19, rs_f20, rs_f21, rs_f22, rs_f23,
+     rs_f24, rs_f25, rs_f26, rs_f27, rs_f28, rs_f29, rs_f30, rs_f31};
 static const RegStorage dp_temps_arr[] =
-    {rs_d0, rs_d1, rs_d2, rs_d3, rs_d4, rs_d5, rs_d6, rs_d7};
+    {rs_d0, rs_d1, rs_d2, rs_d3, rs_d4, rs_d5, rs_d6, rs_d7,
+     rs_d16, rs_d17, rs_d18, rs_d19, rs_d20, rs_d21, rs_d22, rs_d23,
+     rs_d24, rs_d25, rs_d26, rs_d27, rs_d28, rs_d29, rs_d30, rs_d31};
 
 static const std::vector<RegStorage> core_regs(core_regs_arr,
     core_regs_arr + arraysize(core_regs_arr));
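
The enlarged temp sets line up with the AAPCS64 vector-register classes (ABI facts, not ART-specific). Note that d8-d15/s8-s15 are deliberately absent from the temp arrays, leaving them to the register allocator for promotion:

    // v0-v7   arguments/results, caller-save   -> listed as temps
    // v8-v15  callee-save in their low 64 bits -> omitted, available for promotion
    // v16-v31 scratch, caller-save             -> listed as temps
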
@@ -877,12 +888,13 @@
   rl_src.home = false;
   MarkLive(rl_src);
 
-  // TODO(Arm64): compress the Method pointer?
-  StoreValueWide(rl_method, rl_src);
+  // rl_method might be 32-bit, but ArtMethod* on stack is 64-bit, so always flush it.
+  StoreWordDisp(TargetReg(kSp), 0, TargetReg(kArg0));
 
-  // If Method* has been promoted, explicitly flush
+  // If Method* has been promoted, load it into the promoted register;
+  // otherwise, rl_method is the 32-bit value on [sp] and has already been flushed above.
   if (rl_method.location == kLocPhysReg) {
-    StoreWordDisp(TargetReg(kSp), 0, TargetReg(kArg0));
+    StoreValue(rl_method, rl_src);
   }
 
   if (cu_->num_ins == 0) {
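
The last hunk untangles two widths: the ArtMethod* slot at [sp, #0] is 64-bit, while rl_method is tracked as a 32-bit Dalvik value, so the old StoreValueWide(rl_method, rl_src) conflated the two. After the fix (sketching the intent, frame layout assumed):

    // [sp, #0] holds the 64-bit ArtMethod* and is now flushed unconditionally via
    // StoreWordDisp; StoreValue runs only when rl_method was promoted, keeping the
    // 32-bit physical-register copy coherent as well.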