Merge "Remove some SOA from JNI internal." into dalvik-dev
diff --git a/src/compiler/codegen/arm/fp_arm.cc b/src/compiler/codegen/arm/fp_arm.cc
index 5e0e73d..57c55cc 100644
--- a/src/compiler/codegen/arm/fp_arm.cc
+++ b/src/compiler/codegen/arm/fp_arm.cc
@@ -259,6 +259,7 @@
   if (is_double) {
     rl_src1 = LoadValueWide(cu, rl_src1, kFPReg);
     rl_src2 = LoadValueWide(cu, rl_src2, kFPReg);
+    // In case result vreg is also a src vreg, break association to avoid useless copy by EvalLoc()
     ClobberSReg(cu, rl_dest.s_reg_low);
     rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
     LoadConstant(cu, rl_result.low_reg, default_result);
@@ -267,6 +268,7 @@
   } else {
     rl_src1 = LoadValue(cu, rl_src1, kFPReg);
     rl_src2 = LoadValue(cu, rl_src2, kFPReg);
+    // In case result vreg is also a src vreg, break association to avoid useless copy by EvalLoc()
     ClobberSReg(cu, rl_dest.s_reg_low);
     rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
     LoadConstant(cu, rl_result.low_reg, default_result);
diff --git a/src/compiler/codegen/codegen.h b/src/compiler/codegen/codegen.h
index 0698156..e512803 100644
--- a/src/compiler/codegen/codegen.h
+++ b/src/compiler/codegen/codegen.h
@@ -211,6 +211,7 @@
     bool GenInlinedDoubleCvt(CompilationUnit *cu, CallInfo* info);
     bool GenInlinedIndexOf(CompilationUnit* cu, CallInfo* info, bool zero_based);
     bool GenInlinedStringCompareTo(CompilationUnit* cu, CallInfo* info);
+    bool GenInlinedCurrentThread(CompilationUnit* cu, CallInfo* info);
     bool GenIntrinsic(CompilationUnit* cu, CallInfo* info);
 
     // Shared by all targets - implemented in gen_loadstore.cc.
diff --git a/src/compiler/codegen/gen_invoke.cc b/src/compiler/codegen/gen_invoke.cc
index 41924e2..afaa053 100644
--- a/src/compiler/codegen/gen_invoke.cc
+++ b/src/compiler/codegen/gen_invoke.cc
@@ -18,6 +18,7 @@
 #include "../compiler_ir.h"
 #include "ralloc_util.h"
 #include "codegen_util.h"
+#include "x86/codegen_x86.h"
 
 namespace art {
 
@@ -1105,6 +1106,20 @@
   return true;
 }
 
+bool Codegen::GenInlinedCurrentThread(CompilationUnit* cu, CallInfo* info) {
+  RegLocation rl_dest = InlineTarget(cu, info);
+  RegLocation rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
+  int offset = Thread::PeerOffset().Int32Value();  // Thread-relative offset used by both paths.
+  if (cu->instruction_set == kThumb2) {
+    LoadWordDisp(cu, TargetReg(kSelf), offset, rl_result.low_reg);  // Base is TargetReg(kSelf).
+  } else {
+    CHECK(cu->instruction_set == kX86);  // NOTE(review): fails for any ISA but Thumb2/x86; confirm.
+    ((X86Codegen*)this)->OpRegThreadMem(cu, kOpMov, rl_result.low_reg, offset);
+  }
+  StoreValue(cu, rl_dest, rl_result);
+  return true;  // Unconditional: there is no path that returns false.
+}
+
 bool Codegen::GenIntrinsic(CompilationUnit* cu, CallInfo* info)
 {
   if (info->opt_flags & MIR_INLINED) {
@@ -1172,6 +1187,9 @@
     if (tgt_method == "int java.lang.String.length()") {
       return GenInlinedStringIsEmptyOrLength(cu, info, false /* is_empty */);
     }
+    if (tgt_method == "java.lang.Thread java.lang.Thread.currentThread()") {
+      return GenInlinedCurrentThread(cu, info);
+    }
   } else if (tgt_method.find("boolean sun.misc.Unsafe.compareAndSwap") != std::string::npos) {
     if (tgt_method == "boolean sun.misc.Unsafe.compareAndSwapInt(java.lang.Object, long, int, int)") {
       return GenInlinedCas32(cu, info, false);
diff --git a/src/compiler/codegen/local_optimizations.cc b/src/compiler/codegen/local_optimizations.cc
index ec915f0..69b5d8e 100644
--- a/src/compiler/codegen/local_optimizations.cc
+++ b/src/compiler/codegen/local_optimizations.cc
@@ -20,7 +20,7 @@
 
 #define DEBUG_OPT(X)
 
-/* Check RAW, WAR, and WAR dependency on the register operands */
+/* Check RAW, WAR, and WAW dependency on the register operands */
 #define CHECK_REG_DEP(use, def, check) ((def & check->use_mask) || \
                                         ((use | def) & check->def_mask))
 
diff --git a/src/compiler/codegen/ralloc_util.cc b/src/compiler/codegen/ralloc_util.cc
index 999c652..1a3a413 100644
--- a/src/compiler/codegen/ralloc_util.cc
+++ b/src/compiler/codegen/ralloc_util.cc
@@ -124,7 +124,17 @@
   }
 }
 
-/* Clobber any temp associated with an s_reg.  Could be in either class */
+/*
+ * Break the association between a Dalvik vreg and a physical temp register of either register
+ * class.
+ * TODO: Ideally, the public version of this code should not exist.  Besides its local usage
+ * in the register utilities, it is also used by code gen routines to work around a deficiency in
+ * local register allocation, which fails to distinguish between the "in" and "out" identities
+ * of Dalvik vregs.  This can result in useless register copies when the same Dalvik vreg
+ * is used both as the source and destination register of an operation in which the type
+ * changes (for example: INT_TO_FLOAT v1, v1).  Revisit when improved register allocation is
+ * addressed.
+ */
 void ClobberSReg(CompilationUnit* cu, int s_reg)
 {
 #ifndef NDEBUG
diff --git a/src/compiler/codegen/x86/codegen_x86.h b/src/compiler/codegen/x86/codegen_x86.h
index dba4953..4ef186a 100644
--- a/src/compiler/codegen/x86/codegen_x86.h
+++ b/src/compiler/codegen/x86/codegen_x86.h
@@ -18,6 +18,7 @@
 #define ART_SRC_COMPILER_CODEGEN_X86_CODEGENX86_H_
 
 #include "../../compiler_internals.h"
+#include "x86_lir.h"
 
 namespace art {
 
diff --git a/src/compiler/codegen/x86/fp_x86.cc b/src/compiler/codegen/x86/fp_x86.cc
index 78c737d..6bfe9a2 100644
--- a/src/compiler/codegen/x86/fp_x86.cc
+++ b/src/compiler/codegen/x86/fp_x86.cc
@@ -158,6 +158,7 @@
     case Instruction::FLOAT_TO_INT: {
       rl_src = LoadValue(cu, rl_src, kFPReg);
       src_reg = rl_src.low_reg;
+      // In case result vreg is also src vreg, break association to avoid useless copy by EvalLoc()
       ClobberSReg(cu, rl_dest.s_reg_low);
       rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
       int temp_reg = AllocTempFloat(cu);
@@ -179,6 +180,7 @@
     case Instruction::DOUBLE_TO_INT: {
       rl_src = LoadValueWide(cu, rl_src, kFPReg);
       src_reg = rl_src.low_reg;
+      // In case result vreg is also src vreg, break association to avoid useless copy by EvalLoc()
       ClobberSReg(cu, rl_dest.s_reg_low);
       rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
       int temp_reg = AllocTempDouble(cu) | X86_FP_DOUBLE;
@@ -245,6 +247,7 @@
     rl_src2 = LoadValueWide(cu, rl_src2, kFPReg);
     src_reg2 = S2d(rl_src2.low_reg, rl_src2.high_reg);
   }
+  // In case result vreg is also src vreg, break association to avoid useless copy by EvalLoc()
   ClobberSReg(cu, rl_dest.s_reg_low);
   RegLocation rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
   LoadConstantNoClobber(cu, rl_result.low_reg, unordered_gt ? 1 : 0);
diff --git a/src/compiler/codegen/x86/int_x86.cc b/src/compiler/codegen/x86/int_x86.cc
index 190208b..bd3a7fa 100644
--- a/src/compiler/codegen/x86/int_x86.cc
+++ b/src/compiler/codegen/x86/int_x86.cc
@@ -43,16 +43,6 @@
  *    x = y     return  0
  *    x < y     return -1
  *    x > y     return  1
- *
- *    slt   t0,  x.hi, y.hi;        # (x.hi < y.hi) ? 1:0
- *    sgt   t1,  x.hi, y.hi;        # (y.hi > x.hi) ? 1:0
- *    subu  res, t0, t1             # res = -1:1:0 for [ < > = ]
- *    bnez  res, finish
- *    sltu  t0, x.lo, y.lo
- *    sgtu  r1, x.lo, y.lo
- *    subu  res, t0, t1
- * finish:
- *
  */
 void X86Codegen::GenCmpLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2)
@@ -335,6 +325,8 @@
 bool X86Codegen::GenAddLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
                          RegLocation rl_src2)
 {
+  // TODO: fixed register usage here as we only have 4 temps and temporary allocation isn't smart
+  // enough.
   FlushAllRegs(cu);
   LockCallTemps(cu);  // Prepare for explicit register usage
   LoadValueDirectWideFixed(cu, rl_src1, r0, r1);
@@ -351,6 +343,8 @@
 bool X86Codegen::GenSubLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2)
 {
+  // TODO: fixed register usage here as we only have 4 temps and temporary allocation isn't smart
+  // enough.
   FlushAllRegs(cu);
   LockCallTemps(cu);  // Prepare for explicit register usage
   LoadValueDirectWideFixed(cu, rl_src1, r0, r1);
@@ -367,13 +361,15 @@
 bool X86Codegen::GenAndLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2)
 {
+  // TODO: fixed register usage here as we only have 4 temps and temporary allocation isn't smart
+  // enough.
   FlushAllRegs(cu);
   LockCallTemps(cu);  // Prepare for explicit register usage
   LoadValueDirectWideFixed(cu, rl_src1, r0, r1);
   LoadValueDirectWideFixed(cu, rl_src2, r2, r3);
-  // Compute (r1:r0) = (r1:r0) + (r2:r3)
-  OpRegReg(cu, kOpAnd, r0, r2);  // r0 = r0 - r2
-  OpRegReg(cu, kOpAnd, r1, r3);  // r1 = r1 - r3 - CF
+  // Compute (r1:r0) = (r1:r0) & (r2:r3)
+  OpRegReg(cu, kOpAnd, r0, r2);  // r0 = r0 & r2
+  OpRegReg(cu, kOpAnd, r1, r3);  // r1 = r1 & r3
   RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(cu, rl_dest, rl_result);
@@ -383,13 +379,15 @@
 bool X86Codegen::GenOrLong(CompilationUnit* cu, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2)
 {
+  // TODO: fixed register usage here as we only have 4 temps and temporary allocation isn't smart
+  // enough.
   FlushAllRegs(cu);
   LockCallTemps(cu);  // Prepare for explicit register usage
   LoadValueDirectWideFixed(cu, rl_src1, r0, r1);
   LoadValueDirectWideFixed(cu, rl_src2, r2, r3);
-  // Compute (r1:r0) = (r1:r0) + (r2:r3)
-  OpRegReg(cu, kOpOr, r0, r2);  // r0 = r0 - r2
-  OpRegReg(cu, kOpOr, r1, r3);  // r1 = r1 - r3 - CF
+  // Compute (r1:r0) = (r1:r0) | (r2:r3)
+  OpRegReg(cu, kOpOr, r0, r2);  // r0 = r0 | r2
+  OpRegReg(cu, kOpOr, r1, r3);  // r1 = r1 | r3
   RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(cu, rl_dest, rl_result);
@@ -399,13 +397,15 @@
 bool X86Codegen::GenXorLong(CompilationUnit* cu, RegLocation rl_dest,
                             RegLocation rl_src1, RegLocation rl_src2)
 {
+  // TODO: fixed register usage here as we only have 4 temps and temporary allocation isn't smart
+  // enough.
   FlushAllRegs(cu);
   LockCallTemps(cu);  // Prepare for explicit register usage
   LoadValueDirectWideFixed(cu, rl_src1, r0, r1);
   LoadValueDirectWideFixed(cu, rl_src2, r2, r3);
-  // Compute (r1:r0) = (r1:r0) + (r2:r3)
-  OpRegReg(cu, kOpXor, r0, r2);  // r0 = r0 - r2
-  OpRegReg(cu, kOpXor, r1, r3);  // r1 = r1 - r3 - CF
+  // Compute (r1:r0) = (r1:r0) ^ (r2:r3)
+  OpRegReg(cu, kOpXor, r0, r2);  // r0 = r0 ^ r2
+  OpRegReg(cu, kOpXor, r1, r3);  // r1 = r1 ^ r3
   RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, r0, r1,
                           INVALID_SREG, INVALID_SREG};
   StoreValueWide(cu, rl_dest, rl_result);
@@ -431,6 +431,7 @@
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
   case kOpCmp: opcode = kX86Cmp32RT;  break;
+  case kOpMov: opcode = kX86Mov32RT;  break;
   default:
     LOG(FATAL) << "Bad opcode: " << op;
     break;
diff --git a/src/thread.cc b/src/thread.cc
index 72ceaf0..75d0468 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -1636,6 +1636,7 @@
   DO_THREAD_OFFSET(state_and_flags_);
   DO_THREAD_OFFSET(card_table_);
   DO_THREAD_OFFSET(exception_);
+  DO_THREAD_OFFSET(opeer_);
   DO_THREAD_OFFSET(jni_env_);
   DO_THREAD_OFFSET(self_);
   DO_THREAD_OFFSET(stack_end_);
diff --git a/src/thread.h b/src/thread.h
index 4c065c5..8b9c81d 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -442,6 +442,10 @@
     return ThreadOffset(OFFSETOF_MEMBER(Thread, exception_));
   }
 
+  static ThreadOffset PeerOffset() {  // Offset of the opeer_ member within Thread.
+    return ThreadOffset(OFFSETOF_MEMBER(Thread, opeer_));
+  }
+
   static ThreadOffset ThinLockIdOffset() {
     return ThreadOffset(OFFSETOF_MEMBER(Thread, thin_lock_id_));
   }