ARM: Strength reduction for floating-point division

For floating-point division by power-of-two constants, generate a
multiplication by the reciprocal instead.

Change-Id: I39c79eeb26b60cc754ad42045362b79498c755be
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 179ba02..d235199 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -90,6 +90,10 @@
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
+    void GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                    int32_t constant) OVERRIDE;
+    void GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                     int64_t constant) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(QuickEntrypointEnum trampoline) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
diff --git a/compiler/dex/quick/arm/fp_arm.cc b/compiler/dex/quick/arm/fp_arm.cc
index 3eb7c83..2b2592d 100644
--- a/compiler/dex/quick/arm/fp_arm.cc
+++ b/compiler/dex/quick/arm/fp_arm.cc
@@ -113,6 +113,32 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+// Multiplies rl_src1 by a float constant given as raw bits and stores the product in rl_dest.
+void ArmMir2Lir::GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                            int32_t constant) {
+  RegStorage r_tmp = AllocTempSingle();  // Scratch FP register for the constant operand.
+  LoadConstantNoClobber(r_tmp, constant);
+  rl_src1 = LoadValue(rl_src1, kFPReg);
+  RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
+  NewLIR3(kThumb2Vmuls, rl_result.reg.GetReg(), rl_src1.reg.GetReg(), r_tmp.GetReg());
+  StoreValue(rl_dest, rl_result);
+}
+
+// Multiplies rl_src1 by a double constant given as raw bits and stores the product in rl_dest.
+void ArmMir2Lir::GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                             int64_t constant) {
+  RegStorage r_tmp = AllocTempDouble();  // Scratch FP register for the constant operand.
+  DCHECK(r_tmp.IsDouble());
+  LoadConstantWide(r_tmp, constant);
+  rl_src1 = LoadValueWide(rl_src1, kFPReg);
+  DCHECK(rl_src1.wide);
+  RegLocation rl_result = EvalLocWide(rl_dest, kFPReg, true);
+  DCHECK(rl_dest.wide);
+  DCHECK(rl_result.wide);
+  NewLIR3(kThumb2Vmuld, rl_result.reg.GetReg(), rl_src1.reg.GetReg(), r_tmp.GetReg());
+  StoreValueWide(rl_dest, rl_result);
+}
+
 void ArmMir2Lir::GenConversion(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src) {
   int op = kThumbBkpt;
   int src_reg;
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index bd363c4..5182a89 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -71,6 +71,10 @@
   bool HandleEasyDivRem64(Instruction::Code dalvik_opcode, bool is_div,
                           RegLocation rl_src, RegLocation rl_dest, int64_t lit);
   bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
+  void GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                  int32_t constant) OVERRIDE;
+  void GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                   int64_t constant) OVERRIDE;
   LIR* CheckSuspendUsingLoad() OVERRIDE;
   RegStorage LoadHelper(QuickEntrypointEnum trampoline) OVERRIDE;
   LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
diff --git a/compiler/dex/quick/arm64/fp_arm64.cc b/compiler/dex/quick/arm64/fp_arm64.cc
index db24d12..ff692b7 100644
--- a/compiler/dex/quick/arm64/fp_arm64.cc
+++ b/compiler/dex/quick/arm64/fp_arm64.cc
@@ -116,6 +116,32 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+// Multiplies rl_src1 by a float constant given as raw bits and stores the product in rl_dest.
+void Arm64Mir2Lir::GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                              int32_t constant) {
+  RegStorage r_tmp = AllocTempSingle();  // Scratch FP register for the constant operand.
+  LoadConstantNoClobber(r_tmp, constant);
+  rl_src1 = LoadValue(rl_src1, kFPReg);
+  RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
+  NewLIR3(kA64Fmul3fff, rl_result.reg.GetReg(), rl_src1.reg.GetReg(), r_tmp.GetReg());
+  StoreValue(rl_dest, rl_result);
+}
+
+// Multiplies rl_src1 by a double constant given as raw bits and stores the product in rl_dest.
+void Arm64Mir2Lir::GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                               int64_t constant) {
+  RegStorage r_tmp = AllocTempDouble();  // Scratch FP register for the constant operand.
+  DCHECK(r_tmp.IsDouble());
+  LoadConstantWide(r_tmp, constant);
+  rl_src1 = LoadValueWide(rl_src1, kFPReg);
+  DCHECK(rl_src1.wide);
+  RegLocation rl_result = EvalLocWide(rl_dest, kFPReg, true);
+  DCHECK(rl_dest.wide);
+  DCHECK(rl_result.wide);
+  NewLIR3(WIDE(kA64Fmul3fff), rl_result.reg.GetReg(), rl_src1.reg.GetReg(), r_tmp.GetReg());
+  StoreValueWide(rl_dest, rl_result);
+}
+
 void Arm64Mir2Lir::GenConversion(Instruction::Code opcode,
                                  RegLocation rl_dest, RegLocation rl_src) {
   int op = kA64Brk1d;
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index c5aa27c..061ee07 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1785,6 +1785,34 @@
   return true;
 }
 
+// Replaces a constant FP divide with a reciprocal multiply; returns true if code was emitted.
+bool Mir2Lir::HandleEasyFloatingPointDiv(RegLocation rl_dest, RegLocation rl_src1,
+                                         RegLocation rl_src2) {
+  if (!rl_src2.is_const ||
+      ((cu_->instruction_set != kThumb2) && (cu_->instruction_set != kArm64))) {
+    return false;
+  }
+
+  if (!rl_src2.wide) {
+    int32_t divisor = mir_graph_->ConstantValue(rl_src2);
+    if (CanDivideByReciprocalMultiplyFloat(divisor)) {
+      // Generate multiply by reciprocal instead of div; the constant is carried as raw bits.
+      float recip = 1.0f/bit_cast<int32_t, float>(divisor);
+      GenMultiplyByConstantFloat(rl_dest, rl_src1, bit_cast<float, int32_t>(recip));
+      return true;
+    }
+  } else {
+    int64_t divisor = mir_graph_->ConstantValueWide(rl_src2);
+    if (CanDivideByReciprocalMultiplyDouble(divisor)) {
+      // As in the float branch, bit_cast<int64_t, double> reinterprets the bits as a double.
+      double recip = 1.0/bit_cast<int64_t, double>(divisor);
+      GenMultiplyByConstantDouble(rl_dest, rl_src1, bit_cast<double, int64_t>(recip));
+      return true;
+    }
+  }
+  return false;
+}
+
 void Mir2Lir::GenArithOpIntLit(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src,
                                int lit) {
   RegLocation rl_result;
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index dc6930c..7e9d80d 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -31,6 +31,10 @@
     bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                             RegLocation rl_dest, int lit);
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
+    void GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                    int32_t constant) OVERRIDE;
+    void GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                     int64_t constant) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(QuickEntrypointEnum trampoline) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
diff --git a/compiler/dex/quick/mips/fp_mips.cc b/compiler/dex/quick/mips/fp_mips.cc
index 4315915..0a7aa99 100644
--- a/compiler/dex/quick/mips/fp_mips.cc
+++ b/compiler/dex/quick/mips/fp_mips.cc
@@ -113,6 +113,20 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+void MipsMir2Lir::GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                             int32_t constant) {
+  // TODO: need mips implementation. HandleEasyFloatingPointDiv returns early for non-ARM ISAs.
+  UNUSED(rl_dest, rl_src1, constant);
+  LOG(FATAL) << "Unimplemented GenMultiplyByConstantFloat in mips";
+}
+
+void MipsMir2Lir::GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                              int64_t constant) {
+  // TODO: need mips implementation. HandleEasyFloatingPointDiv returns early for non-ARM ISAs.
+  UNUSED(rl_dest, rl_src1, constant);
+  LOG(FATAL) << "Unimplemented GenMultiplyByConstantDouble in mips";
+}
+
 void MipsMir2Lir::GenConversion(Instruction::Code opcode, RegLocation rl_dest,
                                 RegLocation rl_src) {
   int op = kMipsNop;
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 533a677..ccaa167 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -1052,28 +1052,36 @@
       }
       break;
 
+    case Instruction::DIV_FLOAT:
+    case Instruction::DIV_FLOAT_2ADDR:
+      if (HandleEasyFloatingPointDiv(rl_dest, rl_src[0], rl_src[1])) {
+        break;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::ADD_FLOAT:
     case Instruction::SUB_FLOAT:
     case Instruction::MUL_FLOAT:
-    case Instruction::DIV_FLOAT:
     case Instruction::REM_FLOAT:
     case Instruction::ADD_FLOAT_2ADDR:
     case Instruction::SUB_FLOAT_2ADDR:
     case Instruction::MUL_FLOAT_2ADDR:
-    case Instruction::DIV_FLOAT_2ADDR:
     case Instruction::REM_FLOAT_2ADDR:
       GenArithOpFloat(opcode, rl_dest, rl_src[0], rl_src[1]);
       break;
 
+    case Instruction::DIV_DOUBLE:
+    case Instruction::DIV_DOUBLE_2ADDR:
+      if (HandleEasyFloatingPointDiv(rl_dest, rl_src[0], rl_src[1])) {
+        break;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::ADD_DOUBLE:
     case Instruction::SUB_DOUBLE:
     case Instruction::MUL_DOUBLE:
-    case Instruction::DIV_DOUBLE:
     case Instruction::REM_DOUBLE:
     case Instruction::ADD_DOUBLE_2ADDR:
     case Instruction::SUB_DOUBLE_2ADDR:
     case Instruction::MUL_DOUBLE_2ADDR:
-    case Instruction::DIV_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE_2ADDR:
       GenArithOpDouble(opcode, rl_dest, rl_src[0], rl_src[1]);
       break;
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 4623f79..bacc6d2 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -789,6 +789,7 @@
     virtual bool HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
                                   RegLocation rl_src, RegLocation rl_dest, int lit);
     bool HandleEasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit);
+    bool HandleEasyFloatingPointDiv(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
     virtual void HandleSlowPaths();
     void GenBarrier();
     void GenDivZeroException();
@@ -1120,6 +1121,10 @@
     virtual bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div,
                                     RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
     virtual bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
+    virtual void GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                            int32_t constant) = 0;
+    virtual void GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                             int64_t constant) = 0;
     virtual LIR* CheckSuspendUsingLoad() = 0;
 
     virtual RegStorage LoadHelper(QuickEntrypointEnum trampoline) = 0;
@@ -1439,6 +1444,26 @@
       return InexpensiveConstantInt(value);
     }
 
+    /**
+     * @brief Checks whether a divide by this float constant can become a reciprocal multiply.
+     * @param divisor Raw bit pattern of the constant float divisor.
+     * @return True iff x/divisor == x*(1.0f/divisor) for every float x (all significand bits 0).
+     */
+    bool CanDivideByReciprocalMultiplyFloat(int32_t divisor) {
+      const int32_t significand_mask = (1 << 23) - 1;  // Low 23 bits, i.e. 0x7fffff.
+      return (divisor & significand_mask) == 0;
+    }
+
+    /**
+     * @brief Checks whether a divide by this double constant can become a reciprocal multiply.
+     * @param divisor Raw bit pattern of the constant double divisor.
+     * @return True iff x/divisor == x*(1.0/divisor) for every double x (all significand bits 0).
+     */
+    bool CanDivideByReciprocalMultiplyDouble(int64_t divisor) {
+      const uint64_t significand_mask = (UINT64_C(1) << 52) - 1;  // Low 52 bits of the pattern.
+      return (divisor & significand_mask) == 0;
+    }
+
     // May be optimized by targets.
     virtual void GenMonitorEnter(int opt_flags, RegLocation rl_src);
     virtual void GenMonitorExit(int opt_flags, RegLocation rl_src);
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index dec99ae..4412a1e 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -78,6 +78,10 @@
   bool SmallLiteralDivRem(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src,
                           RegLocation rl_dest, int lit) OVERRIDE;
   bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
+  void GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                  int32_t constant) OVERRIDE;
+  void GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                   int64_t constant) OVERRIDE;
   LIR* CheckSuspendUsingLoad() OVERRIDE;
   RegStorage LoadHelper(QuickEntrypointEnum trampoline) OVERRIDE;
   LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 254d90f..33bb0ee 100755
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -122,6 +122,20 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+void X86Mir2Lir::GenMultiplyByConstantFloat(RegLocation rl_dest, RegLocation rl_src1,
+                                            int32_t constant) {
+  // TODO: need x86 implementation. HandleEasyFloatingPointDiv returns early for non-ARM ISAs.
+  UNUSED(rl_dest, rl_src1, constant);
+  LOG(FATAL) << "Unimplemented GenMultiplyByConstantFloat in x86";
+}
+
+void X86Mir2Lir::GenMultiplyByConstantDouble(RegLocation rl_dest, RegLocation rl_src1,
+                                             int64_t constant) {
+  // TODO: need x86 implementation. HandleEasyFloatingPointDiv returns early for non-ARM ISAs.
+  UNUSED(rl_dest, rl_src1, constant);
+  LOG(FATAL) << "Unimplemented GenMultiplyByConstantDouble in x86";
+}
+
 void X86Mir2Lir::GenLongToFP(RegLocation rl_dest, RegLocation rl_src, bool is_double) {
   // Compute offsets to the source and destination VRs on stack
   int src_v_reg_offset = SRegOffset(rl_src.s_reg_low);