Implement GenInlinedReverseBits

Added support for the x86 inlined version of the bit-reverse methods for int and long.

Change-Id: I7dbdc13b4afedd56557e9eff038a31517cdb1843
Signed-off-by: Yixin Shou <yixin.shou@intel.com>
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 24a3fe3..1f5b350 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -159,6 +159,7 @@
   bool GenInlinedCas(CallInfo* info, bool is_long, bool is_object) OVERRIDE;
   bool GenInlinedMinMax(CallInfo* info, bool is_min, bool is_long) OVERRIDE;
   bool GenInlinedMinMaxFP(CallInfo* info, bool is_min, bool is_double) OVERRIDE;
+  bool GenInlinedReverseBits(CallInfo* info, OpSize size) OVERRIDE;
   bool GenInlinedSqrt(CallInfo* info) OVERRIDE;
   bool GenInlinedAbsFloat(CallInfo* info) OVERRIDE;
   bool GenInlinedAbsDouble(CallInfo* info) OVERRIDE;
@@ -957,6 +958,9 @@
  private:
   // The number of vector registers [0..N] reserved by a call to ReserveVectorRegisters
   int num_reserved_vector_regs_;
+
+  void SwapBits(RegStorage result_reg, int shift, int32_t value);
+  void SwapBits64(RegStorage result_reg, int shift, int64_t value);
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index fdc46e2..afa2ae2 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -1061,6 +1061,83 @@
   return true;
 }
 
+void X86Mir2Lir::SwapBits(RegStorage result_reg, int shift, int32_t value) {
+  RegStorage r_temp = AllocTemp();
+  OpRegCopy(r_temp, result_reg);
+  OpRegImm(kOpLsr, result_reg, shift);
+  OpRegImm(kOpAnd, r_temp, value);
+  OpRegImm(kOpAnd, result_reg, value);
+  OpRegImm(kOpLsl, r_temp, shift);
+  OpRegReg(kOpOr, result_reg, r_temp);
+  FreeTemp(r_temp);
+}
+
+void X86Mir2Lir::SwapBits64(RegStorage result_reg, int shift, int64_t value) {
+  RegStorage r_temp = AllocTempWide();
+  OpRegCopy(r_temp, result_reg);
+  OpRegImm(kOpLsr, result_reg, shift);
+  RegStorage r_value = AllocTempWide();
+  LoadConstantWide(r_value, value);
+  OpRegReg(kOpAnd, r_temp, r_value);
+  OpRegReg(kOpAnd, result_reg, r_value);
+  OpRegImm(kOpLsl, r_temp, shift);
+  OpRegReg(kOpOr, result_reg, r_temp);
+  FreeTemp(r_temp);
+  FreeTemp(r_value);
+}
+
+bool X86Mir2Lir::GenInlinedReverseBits(CallInfo* info, OpSize size) {
+  RegLocation rl_src_i = info->args[0];
+  RegLocation rl_i = (size == k64) ? LoadValueWide(rl_src_i, kCoreReg)
+                                   : LoadValue(rl_src_i, kCoreReg);
+  RegLocation rl_dest = (size == k64) ? InlineTargetWide(info) : InlineTarget(info);
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  if (size == k64) {
+    if (cu_->instruction_set == kX86_64) {
+      /* Use one bswap instruction to reverse byte order first and then use 3 rounds of
+         swapping bits to reverse bits in a long number x. Using bswap to save instructions
+         compared to generic luni implementation which has 5 rounds of swapping bits.
+         x = bswap x
+         x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
+         x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
+         x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
+      */
+      OpRegReg(kOpRev, rl_result.reg, rl_i.reg);
+      SwapBits64(rl_result.reg, 1, 0x5555555555555555);
+      SwapBits64(rl_result.reg, 2, 0x3333333333333333);
+      SwapBits64(rl_result.reg, 4, 0x0f0f0f0f0f0f0f0f);
+      StoreValueWide(rl_dest, rl_result);
+      return true;
+    }
+    RegStorage r_i_low = rl_i.reg.GetLow();
+    if (rl_i.reg.GetLowReg() == rl_result.reg.GetLowReg()) {
+      // First REV shall clobber rl_result.reg.GetLowReg(), save the value in a temp for the second
+      // REV.
+      r_i_low = AllocTemp();
+      OpRegCopy(r_i_low, rl_i.reg);
+    }
+    OpRegReg(kOpRev, rl_result.reg.GetLow(), rl_i.reg.GetHigh());
+    OpRegReg(kOpRev, rl_result.reg.GetHigh(), r_i_low);
+    if (rl_i.reg.GetLowReg() == rl_result.reg.GetLowReg()) {
+      FreeTemp(r_i_low);
+    }
+    SwapBits(rl_result.reg.GetLow(), 1, 0x55555555);
+    SwapBits(rl_result.reg.GetLow(), 2, 0x33333333);
+    SwapBits(rl_result.reg.GetLow(), 4, 0x0f0f0f0f);
+    SwapBits(rl_result.reg.GetHigh(), 1, 0x55555555);
+    SwapBits(rl_result.reg.GetHigh(), 2, 0x33333333);
+    SwapBits(rl_result.reg.GetHigh(), 4, 0x0f0f0f0f);
+    StoreValueWide(rl_dest, rl_result);
+  } else {
+    OpRegReg(kOpRev, rl_result.reg, rl_i.reg);
+    SwapBits(rl_result.reg, 1, 0x55555555);
+    SwapBits(rl_result.reg, 2, 0x33333333);
+    SwapBits(rl_result.reg, 4, 0x0f0f0f0f);
+    StoreValue(rl_dest, rl_result);
+  }
+  return true;
+}
+
 LIR* X86Mir2Lir::OpPcRelLoad(RegStorage reg, LIR* target) {
   CHECK(base_of_code_ != nullptr);