Merge "AArch64: Implement InexpensiveConstant methods."
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 2c587b8..4680f8f 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -240,6 +240,7 @@
   void OpRegCopyWide(RegStorage dest, RegStorage src) OVERRIDE;
 
   bool InexpensiveConstantInt(int32_t value) OVERRIDE;
+  bool InexpensiveConstantInt(int32_t value, Instruction::Code opcode) OVERRIDE;
   bool InexpensiveConstantFloat(int32_t value) OVERRIDE;
   bool InexpensiveConstantLong(int64_t value) OVERRIDE;
   bool InexpensiveConstantDouble(int64_t value) OVERRIDE;
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 9403d5e..cc0d967 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -1192,22 +1192,7 @@
 
 void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                                      RegLocation rl_src1, RegLocation rl_src2) {
-  if ((opcode == Instruction::SUB_LONG) || (opcode == Instruction::SUB_LONG_2ADDR)) {
-    if (!rl_src2.is_const) {
-      return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
-    }
-  } else {
-    // Associativity.
-    if (!rl_src2.is_const) {
-      DCHECK(rl_src1.is_const);
-      std::swap(rl_src1, rl_src2);
-    }
-  }
-  DCHECK(rl_src2.is_const);
-
   OpKind op = kOpBkpt;
-  int64_t val = mir_graph_->ConstantValueWide(rl_src2);
-
   switch (opcode) {
     case Instruction::ADD_LONG:
     case Instruction::ADD_LONG_2ADDR:
@@ -1233,6 +1218,20 @@
       LOG(FATAL) << "Unexpected opcode";
   }
 
+  if (op == kOpSub) {
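+    // Subtraction is not commutative: if only rl_src1 is constant we cannot simply swap the
+    // operands, so fall back to the three-register form.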
+    if (!rl_src2.is_const) {
+      return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
+    }
+  } else {
+    // Commutativity: swap the operands so that the constant ends up in rl_src2.
+    if (!rl_src2.is_const) {
+      DCHECK(rl_src1.is_const);
+      std::swap(rl_src1, rl_src2);
+    }
+  }
+  DCHECK(rl_src2.is_const);
+  int64_t val = mir_graph_->ConstantValueWide(rl_src2);
+
   rl_src1 = LoadValueWide(rl_src1, kCoreReg);
   RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
   OpRegRegImm64(op, rl_result.reg, rl_src1.reg, val);
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index cd1840a..5326e74 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -269,8 +269,47 @@
   return (n << 12 | imm_r << 6 | imm_s);
 }
 
+// Maximum number of instructions to use for encoding the immediate.
+static const int max_num_ops_per_const_load = 2;
+
+/**
+ * @brief Return the number of fast halfwords in the given uint64_t integer.
+ * @details The input integer is split into 4 halfwords (bits 0-15, 16-31, 32-47, 48-63). The
+ *   number of fast halfwords (halfwords that are either 0 or 0xffff) is returned. See the
+ *   @return description below for the precise encoding of the result.
+ * @param value The input 64-bit integer.
+ * @return A value @c retval such that (retval & 0x7) is the maximum of n and m, where n is
+ *   the number of halfwords with all bits unset (0) and m is the number of halfwords with all bits
+ *   set (0xffff). Additionally (retval & 0x8) is set when m > n.
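+ *   For example, for 0x0000ffff00001234 we have n = 2 (halfwords 1 and 3) and m = 1
+ *   (halfword 2), so this function returns 2; for 0xffffffffffff1234 we have n = 0 and m = 3,
+ *   so it returns (3 | 0x8) = 0xb.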
+ */
+static int GetNumFastHalfWords(uint64_t value) {
+  unsigned int num_0000_halfwords = 0;
+  unsigned int num_ffff_halfwords = 0;
+  for (int shift = 0; shift < 64; shift += 16) {
+    uint16_t halfword = static_cast<uint16_t>(value >> shift);
+    if (halfword == 0)
+      num_0000_halfwords++;
+    else if (halfword == UINT16_C(0xffff))
+      num_ffff_halfwords++;
+  }
+  if (num_0000_halfwords >= num_ffff_halfwords) {
+    DCHECK_LE(num_0000_halfwords, 4U);
+    return num_0000_halfwords;
+  } else {
+    DCHECK_LE(num_ffff_halfwords, 4U);
+    return num_ffff_halfwords | 0x8;
+  }
+}
+
+// The InexpensiveConstantXXX variants below are used by the promotion algorithm to decide how
+// strongly a constant should be considered for promotion. If a constant is "inexpensive" to
+// materialize, the promotion algorithm gives it a low priority for promotion, even when it is
+// referenced many times in the code.
+
 bool Arm64Mir2Lir::InexpensiveConstantInt(int32_t value) {
-  return false;  // (ModifiedImmediate(value) >= 0) || (ModifiedImmediate(~value) >= 0);
+  // A 32-bit int can always be loaded with 2 instructions (and without using the literal pool).
+  // We therefore return true and give it a low priority for promotion.
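+  // For example, 0x12345678 can be materialized with "movz w0, #0x5678" followed by
+  // "movk w0, #0x1234, lsl #16".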
+  return true;
 }
 
 bool Arm64Mir2Lir::InexpensiveConstantFloat(int32_t value) {
@@ -278,13 +317,70 @@
 }
 
 bool Arm64Mir2Lir::InexpensiveConstantLong(int64_t value) {
-  return InexpensiveConstantInt(High32Bits(value)) && InexpensiveConstantInt(Low32Bits(value));
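+  // A movz/movn materializes one halfword and sets the others to the 0x0000/0xffff background;
+  // every further non-background ("slow") halfword then costs one extra movk.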
+  int num_slow_halfwords = 4 - (GetNumFastHalfWords(value) & 0x7);
+  if (num_slow_halfwords <= max_num_ops_per_const_load) {
+    return true;
+  }
+  return (EncodeLogicalImmediate(/*is_wide=*/true, value) >= 0);
 }
 
 bool Arm64Mir2Lir::InexpensiveConstantDouble(int64_t value) {
   return EncodeImmDouble(value) >= 0;
 }
 
+// The opcode-aware InexpensiveConstantInt() variant below is used to determine which A64
+// instruction form to use when one of the operands is an immediate (e.g. the register version
+// versus the immediate version of add).
+
+bool Arm64Mir2Lir::InexpensiveConstantInt(int32_t value, Instruction::Code opcode) {
+  switch (opcode) {
+  case Instruction::IF_EQ:
+  case Instruction::IF_NE:
+  case Instruction::IF_LT:
+  case Instruction::IF_GE:
+  case Instruction::IF_GT:
+  case Instruction::IF_LE:
+  case Instruction::ADD_INT:
+  case Instruction::ADD_INT_2ADDR:
+  case Instruction::SUB_INT:
+  case Instruction::SUB_INT_2ADDR:
+    // The code below is consistent with the implementation of OpRegRegImm().
+    {
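+      // A64 add/sub (immediate) accept a 12-bit unsigned immediate, optionally shifted left
+      // by 12; negative values are handled by switching between add and sub, hence the checks
+      // on the absolute value.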
+      int32_t abs_value = std::abs(value);
+      if (abs_value < 0x1000) {
+        return true;
+      } else if ((abs_value & 0xfff) == 0 && ((abs_value >> 12) < 0x1000)) {
+        return true;
+      }
+      return false;
+    }
+  case Instruction::SHL_INT:
+  case Instruction::SHL_INT_2ADDR:
+  case Instruction::SHR_INT:
+  case Instruction::SHR_INT_2ADDR:
+  case Instruction::USHR_INT:
+  case Instruction::USHR_INT_2ADDR:
+    return true;
+  case Instruction::AND_INT:
+  case Instruction::AND_INT_2ADDR:
+  case Instruction::AND_INT_LIT16:
+  case Instruction::AND_INT_LIT8:
+  case Instruction::OR_INT:
+  case Instruction::OR_INT_2ADDR:
+  case Instruction::OR_INT_LIT16:
+  case Instruction::OR_INT_LIT8:
+  case Instruction::XOR_INT:
+  case Instruction::XOR_INT_2ADDR:
+  case Instruction::XOR_INT_LIT16:
+  case Instruction::XOR_INT_LIT8:
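+    // All-zeros and all-ones are exactly the two patterns that the A64 logical (bitmask)
+    // immediate encoding cannot represent; OpRegRegImm() reduces them to plain moves, so they
+    // are inexpensive as well.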
+    if (value == 0 || value == INT32_C(-1)) {
+      return true;
+    }
+    return (EncodeLogicalImmediate(/*is_wide=*/false, value) >= 0);
+  default:
+    return false;
+  }
+}
+
 /*
 * Load an immediate using a single instruction when possible; otherwise
  * use a pair of movz and movk instructions.
@@ -358,9 +454,6 @@
 
 // TODO: clean up the names. LoadConstantWide() should really be LoadConstantNoClobberWide().
 LIR* Arm64Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) {
-  // Maximum number of instructions to use for encoding the immediate.
-  const int max_num_ops = 2;
-
   if (r_dest.IsFloat()) {
     return LoadFPConstantValueWide(r_dest, value);
   }
@@ -378,19 +471,12 @@
   }
 
   // At least one of value's halfwords is neither 0x0 nor 0xffff: find out how many.
-  int num_0000_halfwords = 0;
-  int num_ffff_halfwords = 0;
   uint64_t uvalue = static_cast<uint64_t>(value);
-  for (int shift = 0; shift < 64; shift += 16) {
-    uint16_t halfword = static_cast<uint16_t>(uvalue >> shift);
-    if (halfword == 0)
-      num_0000_halfwords++;
-    else if (halfword == UINT16_C(0xffff))
-      num_ffff_halfwords++;
-  }
-  int num_fast_halfwords = std::max(num_0000_halfwords, num_ffff_halfwords);
+  int num_fast_halfwords = GetNumFastHalfWords(uvalue);
+  int num_slow_halfwords = 4 - (num_fast_halfwords & 0x7);
+  bool more_ffff_halfwords = (num_fast_halfwords & 0x8) != 0;
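+  // num_slow_halfwords is the total number of movz/movn and movk instructions needed;
+  // more_ffff_halfwords selects a movn over an all-ones background instead of a movz.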
 
-  if (num_fast_halfwords < 3) {
+  if (num_slow_halfwords > 1) {
     // A single movz/movn is not enough. Try the logical immediate route.
     int log_imm = EncodeLogicalImmediate(/*is_wide=*/true, value);
     if (log_imm >= 0) {
@@ -398,19 +484,19 @@
     }
   }
 
-  if (num_fast_halfwords >= 4 - max_num_ops) {
+  if (num_slow_halfwords <= max_num_ops_per_const_load) {
     // We can encode the number using a movz/movn followed by one or more movk.
     ArmOpcode op;
     uint16_t background;
     LIR* res = nullptr;
 
     // Decide whether to use a movz or a movn.
-    if (num_0000_halfwords >= num_ffff_halfwords) {
-      op = WIDE(kA64Movz3rdM);
-      background = 0;
-    } else {
+    if (more_ffff_halfwords) {
       op = WIDE(kA64Movn3rdM);
       background = 0xffff;
+    } else {
+      op = WIDE(kA64Movz3rdM);
+      background = 0;
     }
 
     // Emit the first instruction (movz, movn).
@@ -726,7 +812,7 @@
   int64_t abs_value = (neg) ? -value : value;
   ArmOpcode opcode = kA64Brk1d;
   ArmOpcode alt_opcode = kA64Brk1d;
-  int32_t log_imm = -1;
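+  // Set below for and/or/xor, whose immediate forms use the logical (bitmask) immediate
+  // encoding.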
+  bool is_logical = false;
   bool is_wide = r_dest.Is64Bit();
   ArmOpcode wide = (is_wide) ? WIDE(0) : UNWIDE(0);
   int info = 0;
@@ -761,65 +847,89 @@
         opcode = (neg) ? kA64Add4RRdT : kA64Sub4RRdT;
         return NewLIR4(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), abs_value >> 12, 1);
       } else {
-        log_imm = -1;
         alt_opcode = (op == kOpAdd) ? kA64Add4RRre : kA64Sub4RRre;
         info = EncodeExtend(is_wide ? kA64Uxtx : kA64Uxtw, 0);
       }
       break;
-    // case kOpRsub:
-    //   opcode = kThumb2RsubRRI8M;
-    //   alt_opcode = kThumb2RsubRRR;
-    //   break;
     case kOpAdc:
-      log_imm = -1;
       alt_opcode = kA64Adc3rrr;
       break;
     case kOpSbc:
-      log_imm = -1;
       alt_opcode = kA64Sbc3rrr;
       break;
     case kOpOr:
-      log_imm = EncodeLogicalImmediate(is_wide, value);
+      is_logical = true;
       opcode = kA64Orr3Rrl;
       alt_opcode = kA64Orr4rrro;
       break;
     case kOpAnd:
-      log_imm = EncodeLogicalImmediate(is_wide, value);
+      is_logical = true;
       opcode = kA64And3Rrl;
       alt_opcode = kA64And4rrro;
       break;
     case kOpXor:
-      log_imm = EncodeLogicalImmediate(is_wide, value);
+      is_logical = true;
       opcode = kA64Eor3Rrl;
       alt_opcode = kA64Eor4rrro;
       break;
     case kOpMul:
       // TUNING: power of 2, shift & add
-      log_imm = -1;
       alt_opcode = kA64Mul3rrr;
       break;
     default:
       LOG(FATAL) << "Bad opcode: " << op;
   }
 
-  if (log_imm >= 0) {
-    return NewLIR3(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), log_imm);
-  } else {
-    RegStorage r_scratch;
-    if (is_wide) {
-      r_scratch = AllocTempWide();
-      LoadConstantWide(r_scratch, value);
+  if (is_logical) {
+    int log_imm = EncodeLogicalImmediate(is_wide, value);
+    if (log_imm >= 0) {
+      return NewLIR3(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), log_imm);
     } else {
-      r_scratch = AllocTemp();
-      LoadConstant(r_scratch, value);
+      // When the immediate is either 0 or ~0, the logical operation can be trivially reduced
+      // to a (possibly inverted) assignment.
+      if (value == 0) {
+        switch (op) {
+          case kOpOr:
+          case kOpXor:
+            // Or/Xor by zero reduces to an assignment.
+            return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+          default:
+            // And by zero reduces to a `mov rdest, xzr'.
+            DCHECK(op == kOpAnd);
+            return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), (is_wide) ? rxzr : rwzr);
+        }
+      } else if (value == INT64_C(-1)
+                 || (!is_wide && static_cast<uint32_t>(value) == ~UINT32_C(0))) {
+        switch (op) {
+          case kOpAnd:
+            // And by -1 reduces to an assignment.
+            return NewLIR2(kA64Mov2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+          case kOpXor:
+            // Xor by -1 reduces to an `mvn rdest, rsrc'.
+            return NewLIR2(kA64Mvn2rr | wide, r_dest.GetReg(), r_src1.GetReg());
+          default:
+            // Or by -1 reduces to a `mvn rdest, xzr'.
+            DCHECK(op == kOpOr);
+            return NewLIR2(kA64Mvn2rr | wide, r_dest.GetReg(), (is_wide) ? rxzr : rwzr);
+        }
+      }
     }
-    if (EncodingMap[alt_opcode].flags & IS_QUAD_OP)
-      res = NewLIR4(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg(), info);
-    else
-      res = NewLIR3(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg());
-    FreeTemp(r_scratch);
-    return res;
   }
+
+  RegStorage r_scratch;
+  if (is_wide) {
+    r_scratch = AllocTempWide();
+    LoadConstantWide(r_scratch, value);
+  } else {
+    r_scratch = AllocTemp();
+    LoadConstant(r_scratch, value);
+  }
+  if (EncodingMap[alt_opcode].flags & IS_QUAD_OP)
+    res = NewLIR4(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg(), info);
+  else
+    res = NewLIR3(alt_opcode | wide, r_dest.GetReg(), r_src1.GetReg(), r_scratch.GetReg());
+  FreeTemp(r_scratch);
+  return res;
 }
 
 LIR* Arm64Mir2Lir::OpRegImm(OpKind op, RegStorage r_dest_src1, int value) {
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index aae9155..3e03750 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -256,7 +256,7 @@
     RegLocation rl_temp = UpdateLoc(rl_src2);
     int32_t constant_value = mir_graph_->ConstantValue(rl_src2);
     if ((rl_temp.location == kLocDalvikFrame) &&
-        InexpensiveConstantInt(constant_value)) {
+        InexpensiveConstantInt(constant_value, opcode)) {
       // OK - convert this to a compare immediate and branch
       OpCmpImmBranch(cond, rl_src1.reg, mir_graph_->ConstantValue(rl_src2), taken);
       return;
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 4d8b91e..e519011 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -926,11 +926,11 @@
     case Instruction::XOR_INT:
     case Instruction::XOR_INT_2ADDR:
       if (rl_src[0].is_const &&
-          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[0]))) {
+          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[0]), opcode)) {
         GenArithOpIntLit(opcode, rl_dest, rl_src[1],
                              mir_graph_->ConstantValue(rl_src[0].orig_sreg));
       } else if (rl_src[1].is_const &&
-          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]))) {
+                 InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]), opcode)) {
         GenArithOpIntLit(opcode, rl_dest, rl_src[0],
                              mir_graph_->ConstantValue(rl_src[1].orig_sreg));
       } else {
@@ -951,7 +951,7 @@
     case Instruction::USHR_INT:
     case Instruction::USHR_INT_2ADDR:
       if (rl_src[1].is_const &&
-          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]))) {
+          InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]), opcode)) {
         GenArithOpIntLit(opcode, rl_dest, rl_src[0], mir_graph_->ConstantValue(rl_src[1]));
       } else {
         GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[1]);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 70c17da..d8b3083 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -1445,6 +1445,9 @@
     virtual bool InexpensiveConstantFloat(int32_t value) = 0;
     virtual bool InexpensiveConstantLong(int64_t value) = 0;
     virtual bool InexpensiveConstantDouble(int64_t value) = 0;
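+    // Returns whether the constant is cheap in the context of the opcode that consumes it.
+    // Targets that do not override this fall back to the opcode-agnostic version above.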
+    virtual bool InexpensiveConstantInt(int32_t value, Instruction::Code opcode) {
+      return InexpensiveConstantInt(value);
+    }
 
     // May be optimized by targets.
     virtual void GenMonitorEnter(int opt_flags, RegLocation rl_src);
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 45244e1..be966e1 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -1171,12 +1171,13 @@
       } else {
         counts[p_map_idx].count += use_count;
       }
-    } else if (!IsInexpensiveConstant(loc)) {
+    } else {
       if (loc.wide && WideGPRsAreAliases()) {
-        // Longs and doubles can be counted together.
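+        // On targets where wide GPRs alias their narrow halves, a wide value is counted once:
+        // skip the s_reg of its high word.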
         i++;
       }
-      counts[p_map_idx].count += use_count;
+      if (!IsInexpensiveConstant(loc)) {
+        counts[p_map_idx].count += use_count;
+      }
     }
   }
 }
@@ -1185,9 +1186,10 @@
 static int SortCounts(const void *val1, const void *val2) {
   const Mir2Lir::RefCounts* op1 = reinterpret_cast<const Mir2Lir::RefCounts*>(val1);
   const Mir2Lir::RefCounts* op2 = reinterpret_cast<const Mir2Lir::RefCounts*>(val2);
-  // Note that we fall back to sorting on reg so we get stable output
-  // on differing qsort implementations (such as on host and target or
-  // between local host and build servers).
+  // Note that we fall back to sorting on reg so we get stable output on differing qsort
+  // implementations (such as on host and target or between local host and build servers).
+  // Note also that if a wide val1 and a non-wide val2 have the same count, then val1 always
+  // ``loses'' (as STARTING_WIDE_SREG is or-ed into val1->s_reg).
   return (op1->count == op2->count)
           ? (op1->s_reg - op2->s_reg)
           : (op1->count < op2->count ? 1 : -1);
@@ -1230,8 +1232,8 @@
    * TUNING: replace with linear scan once we have the ability
    * to describe register live ranges for GC.
    */
-  size_t core_reg_count_size = cu_->target64 ? num_regs * 2 : num_regs;
-  size_t fp_reg_count_size = num_regs * 2;
+  size_t core_reg_count_size = WideGPRsAreAliases() ? num_regs : num_regs * 2;
+  size_t fp_reg_count_size = WideFPRsAreAliases() ? num_regs : num_regs * 2;
   RefCounts *core_regs =
       static_cast<RefCounts*>(arena_->Alloc(sizeof(RefCounts) * core_reg_count_size,
                                             kArenaAllocRegAlloc));
@@ -1261,7 +1263,6 @@
   // Sum use counts of SSA regs by original Dalvik vreg.
   CountRefs(core_regs, fp_regs, num_regs);
 
-
   // Sort the count arrays
   qsort(core_regs, core_reg_count_size, sizeof(RefCounts), SortCounts);
   qsort(fp_regs, fp_reg_count_size, sizeof(RefCounts), SortCounts);