am 21be5b21: Merge "Intrinsic Unsafe.CompareAndSwapLong() for ARM."

* commit '21be5b21017823b3785f94349e2e2b57d82431e6':
  Intrinsic Unsafe.CompareAndSwapLong() for ARM.
diff --git a/compiler/dex/quick/arm/arm_dex_file_method_inliner.cc b/compiler/dex/quick/arm/arm_dex_file_method_inliner.cc
index 257b2c4..59f7202 100644
--- a/compiler/dex/quick/arm/arm_dex_file_method_inliner.cc
+++ b/compiler/dex/quick/arm/arm_dex_file_method_inliner.cc
@@ -66,8 +66,8 @@
 
     INTRINSIC(SunMiscUnsafe, CompareAndSwapInt, ObjectJII_Z, kIntrinsicCas,
               kIntrinsicFlagNone),
-    // INTRINSIC(SunMiscUnsafe, CompareAndSwapLong, ObjectJJJ_Z, kIntrinsicCas,
-    //           kIntrinsicFlagIsLong),
+    INTRINSIC(SunMiscUnsafe, CompareAndSwapLong, ObjectJJJ_Z, kIntrinsicCas,
+              kIntrinsicFlagIsLong),
     INTRINSIC(SunMiscUnsafe, CompareAndSwapObject, ObjectJObjectObject_Z, kIntrinsicCas,
               kIntrinsicFlagIsObject),
 
diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h
index 8cd7c94..395c788 100644
--- a/compiler/dex/quick/arm/arm_lir.h
+++ b/compiler/dex/quick/arm/arm_lir.h
@@ -426,9 +426,11 @@
   kThumb2Vmovd_IMM8,  // vmov.f64 [111011101] D [11] imm4h[19-16] vd[15-12] [10110000] imm4l[3-0].
   kThumb2Mla,        // mla [111110110000] rn[19-16] ra[15-12] rd[7-4] [0000] rm[3-0].
   kThumb2Umull,      // umull [111110111010] rn[19-16], rdlo[15-12] rdhi[11-8] [0000] rm[3-0].
-  kThumb2Ldrex,      // ldrex [111010000101] rn[19-16] rt[11-8] [1111] imm8[7-0].
-  kThumb2Strex,      // strex [111010000100] rn[19-16] rt[11-8] rd[11-8] imm8[7-0].
-  kThumb2Clrex,      // clrex [111100111011111110000111100101111].
+  kThumb2Ldrex,      // ldrex [111010000101] rn[19-16] rt[15-12] [1111] imm8[7-0].
+  kThumb2Ldrexd,     // ldrexd [111010001101] rn[19-16] rt[15-12] rt2[11-8] [01111111].
+  kThumb2Strex,      // strex [111010000100] rn[19-16] rt[15-12] rd[11-8] imm8[7-0].
+  kThumb2Strexd,     // strexd [111010001100] rn[19-16] rt[15-12] rt2[11-8] [0111] Rd[3-0].
+  kThumb2Clrex,      // clrex [11110011101111111000111100101111].
   kThumb2Bfi,        // bfi [111100110110] rn[19-16] [0] imm3[14-12] rd[11-8] imm2[7-6] [0] msb[4-0].
   kThumb2Bfc,        // bfc [11110011011011110] [0] imm3[14-12] rd[11-8] imm2[7-6] [0] msb[4-0].
   kThumb2Dmb,        // dmb [1111001110111111100011110101] option[3-0].
@@ -447,7 +449,7 @@
   kThumb2MovImm16HST,  // Special purpose version for switch table use.
   kThumb2LdmiaWB,    // ldmia  [111010011001[ rn[19..16] mask[15..0].
   kThumb2SubsRRI12,  // setflags encoding.
-  kThumb2OrrRRRs,    // orrx [111010100101] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
+  kThumb2OrrRRRs,    // orrs [111010100101] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
   kThumb2Push1,      // t3 encoding of push.
   kThumb2Pop1,       // t3 encoding of pop.
   kThumb2RsubRRR,    // rsb [111010111101] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index 1c81a5a..b3236ae 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -877,8 +877,7 @@
                  "vmov.f64", "!0S, #0x!1h", 4, kFixupNone),
     ENCODING_MAP(kThumb2Mla,  0xfb000000,
                  kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
-                 kFmtBitBlt, 15, 12,
-                 IS_QUAD_OP | REG_DEF0 | REG_USE1 | REG_USE2 | REG_USE3,
+                 kFmtBitBlt, 15, 12, IS_QUAD_OP | REG_DEF0_USE123,
                  "mla", "!0C, !1C, !2C, !3C", 4, kFixupNone),
     ENCODING_MAP(kThumb2Umull,  0xfba00000,
                  kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
@@ -889,10 +888,18 @@
                  kFmtBitBlt, 15, 12, kFmtBitBlt, 19, 16, kFmtBitBlt, 7, 0,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1 | IS_LOAD,
                  "ldrex", "!0C, [!1C, #!2E]", 4, kFixupNone),
+    ENCODING_MAP(kThumb2Ldrexd,      0xe8d0007f,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
+                 kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF01_USE2 | IS_LOAD,
+                 "ldrexd", "!0C, !1C, [!2C]", 4, kFixupNone),
     ENCODING_MAP(kThumb2Strex,       0xe8400000,
                  kFmtBitBlt, 11, 8, kFmtBitBlt, 15, 12, kFmtBitBlt, 19, 16,
                  kFmtBitBlt, 7, 0, IS_QUAD_OP | REG_DEF0_USE12 | IS_STORE,
-                 "strex", "!0C,!1C, [!2C, #!2E]", 4, kFixupNone),
+                 "strex", "!0C, !1C, [!2C, #!2E]", 4, kFixupNone),
+    ENCODING_MAP(kThumb2Strexd,      0xe8c00070,
+                 kFmtBitBlt, 3, 0, kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8,
+                 kFmtBitBlt, 19, 16, IS_QUAD_OP | REG_DEF0_USE123 | IS_STORE,
+                 "strexd", "!0C, !1C, !2C, [!3C]", 4, kFixupNone),
     ENCODING_MAP(kThumb2Clrex,       0xf3bf8f2f,
                  kFmtUnused, -1, -1, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, NO_OPERAND,
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 9727179..e839fe5 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -561,22 +561,67 @@
 }
 
 bool ArmMir2Lir::GenInlinedCas(CallInfo* info, bool is_long, bool is_object) {
-  DCHECK(!is_long);  // not supported yet
   DCHECK_EQ(cu_->instruction_set, kThumb2);
   // Unused - RegLocation rl_src_unsafe = info->args[0];
   RegLocation rl_src_obj = info->args[1];  // Object - known non-null
   RegLocation rl_src_offset = info->args[2];  // long low
   rl_src_offset.wide = 0;  // ignore high half in info->args[3]
   RegLocation rl_src_expected = info->args[4];  // int, long or Object
-  RegLocation rl_src_new_value = info->args[5];  // int, long or Object
+  // If is_long, high half is in info->args[5]
+  RegLocation rl_src_new_value = info->args[is_long ? 6 : 5];  // int, long or Object
+  // If is_long, high half is in info->args[7]
   RegLocation rl_dest = InlineTarget(info);  // boolean place for result
 
+  // We have only 5 temporary registers available and actually only 4 if the InlineTarget
+  // above locked one of the temps. For a straightforward CAS64 we need 7 registers:
+  // r_ptr (1), new_value (2), expected(2) and ldrexd result (2). If neither expected nor
+  // new_value is in a non-temp core register we shall reload them in the ldrex/strex loop
+  // into the same temps, reducing the number of required temps down to 5. We shall work
+  // around the potentially locked temp by using LR for r_ptr, unconditionally.
+  // TODO: Pass information about the need for more temps to the stack frame generation
+  // code so that we can rely on being able to allocate enough temps.
+  DCHECK(!reg_pool_->core_regs[rARM_LR].is_temp);
+  MarkTemp(rARM_LR);
+  FreeTemp(rARM_LR);
+  LockTemp(rARM_LR);
+  bool load_early = true;
+  if (is_long) {
+    bool expected_is_core_reg =
+        rl_src_expected.location == kLocPhysReg && !IsFpReg(rl_src_expected.low_reg);
+    bool new_value_is_core_reg =
+        rl_src_new_value.location == kLocPhysReg && !IsFpReg(rl_src_new_value.low_reg);
+    bool expected_is_good_reg = expected_is_core_reg && !IsTemp(rl_src_expected.low_reg);
+    bool new_value_is_good_reg = new_value_is_core_reg && !IsTemp(rl_src_new_value.low_reg);
+
+    if (!expected_is_good_reg && !new_value_is_good_reg) {
+      // Neither expected nor new_value is in a non-temp reg; both must be loaded late.
+      load_early = false;
+      // Make sure they are not in the temp regs and the load will not be skipped.
+      if (expected_is_core_reg) {
+        FlushRegWide(rl_src_expected.low_reg, rl_src_expected.high_reg);
+        ClobberSReg(rl_src_expected.s_reg_low);
+        ClobberSReg(GetSRegHi(rl_src_expected.s_reg_low));
+        rl_src_expected.location = kLocDalvikFrame;
+      }
+      if (new_value_is_core_reg) {
+        FlushRegWide(rl_src_new_value.low_reg, rl_src_new_value.high_reg);
+        ClobberSReg(rl_src_new_value.s_reg_low);
+        ClobberSReg(GetSRegHi(rl_src_new_value.s_reg_low));
+        rl_src_new_value.location = kLocDalvikFrame;
+      }
+    }
+  }
 
   // Release store semantics, get the barrier out of the way.  TODO: revisit
   GenMemBarrier(kStoreLoad);
 
   RegLocation rl_object = LoadValue(rl_src_obj, kCoreReg);
-  RegLocation rl_new_value = LoadValue(rl_src_new_value, kCoreReg);
+  RegLocation rl_new_value;
+  if (!is_long) {
+    rl_new_value = LoadValue(rl_src_new_value, kCoreReg);
+  } else if (load_early) {
+    rl_new_value = LoadValueWide(rl_src_new_value, kCoreReg);
+  }
 
   if (is_object && !mir_graph_->IsConstantNullRef(rl_new_value)) {
     // Mark card for object assuming new value is stored.
@@ -585,7 +630,7 @@
 
   RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
 
-  int r_ptr = AllocTemp();
+  int r_ptr = rARM_LR;
   OpRegRegReg(kOpAdd, r_ptr, rl_object.low_reg, rl_offset.low_reg);
 
   // Free now unneeded rl_object and rl_offset to give more temps.
@@ -594,29 +639,77 @@
   ClobberSReg(rl_offset.s_reg_low);
   FreeTemp(rl_offset.low_reg);
 
-  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  LoadConstant(rl_result.low_reg, 0);  // r_result := 0
+  RegLocation rl_expected;
+  if (!is_long) {
+    rl_expected = LoadValue(rl_src_expected, kCoreReg);
+  } else if (load_early) {
+    rl_expected = LoadValueWide(rl_src_expected, kCoreReg);
+  } else {
+    rl_new_value.low_reg = rl_expected.low_reg = AllocTemp();
+    rl_new_value.high_reg = rl_expected.high_reg = AllocTemp();
+  }
 
-  // while ([r_ptr] == rExpected && r_result == 0) {
-  //   [r_ptr] <- r_new_value && r_result := success ? 0 : 1
-  //   r_result ^= 1
-  // }
-  int r_old_value = AllocTemp();
+  // do {
+  //   tmp = [r_ptr] - expected;
+  // } while (tmp == 0 && failure([r_ptr] <- r_new_value));
+  // result = tmp == 0;
+
+  int r_tmp = AllocTemp();
   LIR* target = NewLIR0(kPseudoTargetLabel);
-  NewLIR3(kThumb2Ldrex, r_old_value, r_ptr, 0);
 
-  RegLocation rl_expected = LoadValue(rl_src_expected, kCoreReg);
-  OpRegReg(kOpCmp, r_old_value, rl_expected.low_reg);
-  FreeTemp(r_old_value);  // Now unneeded.
-  OpIT(kCondEq, "TT");
-  NewLIR4(kThumb2Strex /* eq */, rl_result.low_reg, rl_new_value.low_reg, r_ptr, 0);
-  FreeTemp(r_ptr);  // Now unneeded.
-  OpRegImm(kOpXor /* eq */, rl_result.low_reg, 1);
-  OpRegImm(kOpCmp /* eq */, rl_result.low_reg, 0);
+  if (is_long) {
+    int r_tmp_high = AllocTemp();
+    if (!load_early) {
+      LoadValueDirectWide(rl_src_expected, rl_expected.low_reg, rl_expected.high_reg);
+    }
+    NewLIR3(kThumb2Ldrexd, r_tmp, r_tmp_high, r_ptr);
+    OpRegReg(kOpSub, r_tmp, rl_expected.low_reg);
+    OpRegReg(kOpSub, r_tmp_high, rl_expected.high_reg);
+    if (!load_early) {
+      LoadValueDirectWide(rl_src_new_value, rl_new_value.low_reg, rl_new_value.high_reg);
+    }
+    // Make sure we use ORR that sets the ccode
+    if (ARM_LOWREG(r_tmp) && ARM_LOWREG(r_tmp_high)) {
+      NewLIR2(kThumbOrr, r_tmp, r_tmp_high);
+    } else {
+      NewLIR4(kThumb2OrrRRRs, r_tmp, r_tmp, r_tmp_high, 0);
+    }
+    FreeTemp(r_tmp_high);  // Now unneeded
+
+    DCHECK(last_lir_insn_->u.m.def_mask & ENCODE_CCODE);
+    OpIT(kCondEq, "T");
+    NewLIR4(kThumb2Strexd /* eq */, r_tmp, rl_new_value.low_reg, rl_new_value.high_reg, r_ptr);
+
+  } else {
+    NewLIR3(kThumb2Ldrex, r_tmp, r_ptr, 0);
+    OpRegReg(kOpSub, r_tmp, rl_expected.low_reg);
+    DCHECK(last_lir_insn_->u.m.def_mask & ENCODE_CCODE);
+    OpIT(kCondEq, "T");
+    NewLIR4(kThumb2Strex /* eq */, r_tmp, rl_new_value.low_reg, r_ptr, 0);
+  }
+
+  // Still one conditional left from OpIT(kCondEq, "T") from either branch
+  OpRegImm(kOpCmp /* eq */, r_tmp, 1);
   OpCondBranch(kCondEq, target);
 
+  if (!load_early) {
+    FreeTemp(rl_expected.low_reg);  // Now unneeded.
+    FreeTemp(rl_expected.high_reg);  // Now unneeded.
+  }
+
+  // result := (r_tmp != 0) ? 0 : 1;
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  OpRegRegImm(kOpRsub, rl_result.low_reg, r_tmp, 1);
+  DCHECK(last_lir_insn_->u.m.def_mask & ENCODE_CCODE);
+  OpIT(kCondCc, "");
+  LoadConstant(rl_result.low_reg, 0); /* cc */
+  FreeTemp(r_tmp);  // Now unneeded.
+
   StoreValue(rl_dest, rl_result);
 
+  // Now, restore lr to its non-temp status.
+  Clobber(rARM_LR);
+  UnmarkTemp(rARM_LR);
   return true;
 }
 
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index ad9b0de..f8a2d03 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -87,6 +87,7 @@
 #define REG_DEF0_USE01       (REG_DEF0 | REG_USE01)
 #define REG_DEF0_USE0        (REG_DEF0 | REG_USE0)
 #define REG_DEF0_USE12       (REG_DEF0 | REG_USE12)
+#define REG_DEF0_USE123      (REG_DEF0 | REG_USE123)
 #define REG_DEF0_USE1        (REG_DEF0 | REG_USE1)
 #define REG_DEF0_USE2        (REG_DEF0 | REG_USE2)
 #define REG_DEFAD_USEAD      (REG_DEFAD_USEA | REG_USED)
@@ -98,6 +99,7 @@
 #define REG_USE02            (REG_USE0 | REG_USE2)
 #define REG_USE12            (REG_USE1 | REG_USE2)
 #define REG_USE23            (REG_USE2 | REG_USE3)
+#define REG_USE123           (REG_USE1 | REG_USE2 | REG_USE3)
 
 struct BasicBlock;
 struct CallInfo;
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 936fb07..90d84d5 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -440,18 +440,34 @@
               if (op3 == 0) {   // op3 is 00, op4 is 00
                 opcode << "strex";
                 args << Rd << ", " << Rt << ", [" << Rn << ", #" << (imm8 << 2) << "]";
+                if (Rd.r == 13 || Rd.r == 15 || Rt.r == 13 || Rt.r == 15 || Rn.r == 15 ||
+                    Rd.r == Rn.r || Rd.r == Rt.r) {
+                  args << " (UNPREDICTABLE)";
+                }
               } else {          // op3 is 01, op4 is 00
                 // this is one of strexb, strexh or strexd
                 int op5 = (instr >> 4) & 0xf;
                 switch (op5) {
                   case 4:
-                    opcode << "strexb";
-                    break;
                   case 5:
-                    opcode << "strexh";
+                    opcode << ((op5 == 4) ? "strexb" : "strexh");
+                    Rd = ArmRegister(instr, 0);
+                    args << Rd << ", " << Rt << ", [" << Rn << "]";
+                    if (Rd.r == 13 || Rd.r == 15 || Rt.r == 13 || Rt.r == 15 || Rn.r == 15 ||
+                        Rd.r == Rn.r || Rd.r == Rt.r || (instr & 0xf00) != 0xf00) {
+                      args << " (UNPREDICTABLE)";
+                    }
                     break;
                   case 7:
                     opcode << "strexd";
+                    ArmRegister Rt2 = Rd;
+                    Rd = ArmRegister(instr, 0);
+                    args << Rd << ", " << Rt << ", " << Rt2 << ", [" << Rn << "]";
+                    if (Rd.r == 13 || Rd.r == 15 || Rt.r == 13 || Rt.r == 15 ||
+                        Rt2.r == 13 || Rt2.r == 15 || Rn.r == 15 ||
+                        Rd.r == Rn.r || Rd.r == Rt.r || Rd.r == Rt2.r) {
+                      args << " (UNPREDICTABLE)";
+                    }
                     break;
                 }
               }
@@ -460,6 +476,9 @@
               if (op3 == 0) {   // op3 is 00, op4 is 01
                 opcode << "ldrex";
                 args << Rt << ", [" << Rn << ", #" << (imm8 << 2) << "]";
+                if (Rt.r == 13 || Rt.r == 15 || Rn.r == 15 || (instr & 0xf00) != 0xf00) {
+                  args << " (UNPREDICTABLE)";
+                }
               } else {          // op3 is 01, op4 is 01
                 // this is one of strexb, strexh or strexd
                 int op5 = (instr >> 4) & 0xf;
@@ -471,13 +490,20 @@
                     opcode << "tbh";
                     break;
                   case 4:
-                    opcode << "ldrexb";
-                    break;
                   case 5:
-                    opcode << "ldrexh";
+                    opcode << ((op5 == 4) ? "ldrexb" : "ldrexh");
+                    args << Rt << ", [" << Rn << "]";
+                    if (Rt.r == 13 || Rt.r == 15 || Rn.r == 15 || (instr & 0xf0f) != 0xf0f) {
+                      args << " (UNPREDICTABLE)";
+                    }
                     break;
                   case 7:
                     opcode << "ldrexd";
+                    args << Rt << ", " << Rd /* Rt2 */ << ", [" << Rn << "]";
+                    if (Rt.r == 13 || Rt.r == 15 || Rd.r == 13 /* Rt2 */ || Rd.r == 15 /* Rt2 */ ||
+                        Rn.r == 15 || (instr & 0x00f) != 0x00f) {
+                      args << " (UNPREDICTABLE)";
+                    }
                     break;
                 }
               }
@@ -507,15 +533,6 @@
           }
         }
 
-
-        if (op3 == 0 && op4 == 0) {  // STREX
-          ArmRegister Rd(instr, 8);
-          opcode << "strex";
-          args << Rd << ", " << Rt << ", [" << Rn << ", #" << (imm8 << 2) << "]";
-        } else if (op3 == 0 && op4 == 1) {  // LDREX
-          opcode << "ldrex";
-          args << Rt << ", [" << Rn << ", #" << (imm8 << 2) << "]";
-        }
       } else if ((op2 & 0x60) == 0x20) {  // 01x xxxx
         // Data-processing (shifted register)
         // |111|1110|0000|0|0000|1111|1100|00|00|0000|