Support hardware divide instruction

Bug: 11299025

Uses sdiv for division and a combo of sdiv, mul and sub for modulus.
Only does this on processors that are capable of the sdiv instruction, as determined
by the build system.

Also provides a command line arg --instruction-set-features= to allow cross compilation.
Makefile adds the --instruction-set-features= arg to build-time dex2oat runs and defaults
it to something obtained from the target architecture.

Provides a GetInstructionSetFeatures() function on CompilerDriver that can be
queried for various features.  The only feature supported right now is hasDivideInstruction().

Also adds a few more instructions to the ARM disassembler

b/11535253 is an addition to this CL to be done later.

Change-Id: Ia8aaf801fd94bc71e476902749cf20f74eba9f68
diff --git a/compiler/dex/compiler_ir.h b/compiler/dex/compiler_ir.h
index 0d7209e..fd46975 100644
--- a/compiler/dex/compiler_ir.h
+++ b/compiler/dex/compiler_ir.h
@@ -97,6 +97,9 @@
   CompilerBackend compiler_backend;
   InstructionSet instruction_set;
 
+  const InstructionSetFeatures& GetInstructionSetFeatures() {
+    return compiler_driver->GetInstructionSetFeatures();
+  }
   // TODO: much of this info available elsewhere.  Go to the original source?
   uint16_t num_dalvik_registers;        // method->registers_size.
   const uint16_t* insns;
diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h
index 2ff7f1c..ffaaf84 100644
--- a/compiler/dex/quick/arm/arm_lir.h
+++ b/compiler/dex/quick/arm/arm_lir.h
@@ -380,6 +380,8 @@
   kThumb2CmnRR,      // cmn [111010110001] rn[19..16] [0000] [1111] [0000] rm[3..0].
   kThumb2EorRRR,     // eor [111010101000] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
   kThumb2MulRRR,     // mul [111110110000] rn[19..16] [1111] rd[11..8] [0000] rm[3..0].
+  kThumb2SdivRRR,    // sdiv [111110111001] rn[19..16] [1111] rd[11..8] [1111] rm[3..0].
+  kThumb2UdivRRR,    // udiv [111110111011] rn[19..16] [1111] rd[11..8] [1111] rm[3..0].
   kThumb2MnvRR,      // mvn [11101010011011110] rd[11-8] [0000] rm[3..0].
   kThumb2RsubRRI8,   // rsub [111100011100] rn[19..16] [0000] rd[11..8] imm8[7..0].
   kThumb2NegRR,      // actually rsub rd, rn, #0.
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index e8c188c..3d0f263 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -687,6 +687,14 @@
                  kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
                  "mul", "!0C, !1C, !2C", 4, kFixupNone),
+    ENCODING_MAP(kThumb2SdivRRR,  0xfb90f0f0,
+                 kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
+                 kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
+                 "sdiv", "!0C, !1C, !2C", 4, kFixupNone),
+    ENCODING_MAP(kThumb2UdivRRR,  0xfbb0f0f0,
+                 kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
+                 kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
+                 "udiv", "!0C, !1C, !2C", 4, kFixupNone),
     ENCODING_MAP(kThumb2MnvRR,  0xea6f0000,
                  kFmtBitBlt, 11, 8, kFmtBitBlt, 3, 0, kFmtShift, -1, -1,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1,
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 0a8cbf9..42bf3d4 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -466,14 +466,39 @@
 
 RegLocation ArmMir2Lir::GenDivRemLit(RegLocation rl_dest, int reg1, int lit,
                                      bool is_div) {
-  LOG(FATAL) << "Unexpected use of GenDivRemLit for Arm";
-  return rl_dest;
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+
+  // Put the literal in a temp.
+  int lit_temp = AllocTemp();
+  LoadConstant(lit_temp, lit);
+  // Use the generic case for div/rem with arg2 in a register.
+  // TODO: The literal temp can be freed earlier during a modulus to reduce reg pressure.
+  rl_result = GenDivRem(rl_result, reg1, lit_temp, is_div);
+  FreeTemp(lit_temp);
+
+  return rl_result;
 }
 
 RegLocation ArmMir2Lir::GenDivRem(RegLocation rl_dest, int reg1, int reg2,
                                   bool is_div) {
-  LOG(FATAL) << "Unexpected use of GenDivRem for Arm";
-  return rl_dest;
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  if (is_div) {
+    // Simple case, use sdiv instruction.
+    OpRegRegReg(kOpDiv, rl_result.low_reg, reg1, reg2);
+  } else {
+    // Remainder case, use the following code:
+    // temp = reg1 / reg2      - integer division
+    // temp = temp * reg2
+    // dest = reg1 - temp
+
+    int temp = AllocTemp();
+    OpRegRegReg(kOpDiv, temp, reg1, reg2);
+    OpRegReg(kOpMul, temp, reg2);
+    OpRegRegReg(kOpSub, rl_result.low_reg, reg1, temp);
+    FreeTemp(temp);
+  }
+
+  return rl_result;
 }
 
 bool ArmMir2Lir::GenInlinedMinMaxInt(CallInfo* info, bool is_min) {
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 3ceeacf..d631cf7 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -395,6 +395,10 @@
       DCHECK_EQ(shift, 0);
       opcode = kThumb2MulRRR;
       break;
+    case kOpDiv:
+      DCHECK_EQ(shift, 0);
+      opcode = kThumb2SdivRRR;
+      break;
     case kOpOr:
       opcode = kThumb2OrrRRR;
       break;
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 2b3404a..df6493d 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1307,6 +1307,7 @@
     }
     StoreValue(rl_dest, rl_result);
   } else {
+    bool done = false;      // Set to true if we happen to find a way to use a real instruction.
     if (cu_->instruction_set == kMips) {
       rl_src1 = LoadValue(rl_src1, kCoreReg);
       rl_src2 = LoadValue(rl_src2, kCoreReg);
@@ -1314,7 +1315,23 @@
           GenImmedCheck(kCondEq, rl_src2.low_reg, 0, kThrowDivZero);
       }
       rl_result = GenDivRem(rl_dest, rl_src1.low_reg, rl_src2.low_reg, op == kOpDiv);
-    } else {
+      done = true;
+    } else if (cu_->instruction_set == kThumb2) {
+      if (cu_->GetInstructionSetFeatures().HasDivideInstruction()) {
+        // Use ARM SDIV instruction for division.  For remainder we also need to
+        // calculate using a MUL and subtract.
+        rl_src1 = LoadValue(rl_src1, kCoreReg);
+        rl_src2 = LoadValue(rl_src2, kCoreReg);
+        if (check_zero) {
+            GenImmedCheck(kCondEq, rl_src2.low_reg, 0, kThrowDivZero);
+        }
+        rl_result = GenDivRem(rl_dest, rl_src1.low_reg, rl_src2.low_reg, op == kOpDiv);
+        done = true;
+      }
+    }
+
+    // If we haven't already generated the code use the callout function.
+    if (!done) {
       ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pIdivmod);
       FlushAllRegs();   /* Send everything to home location */
       LoadValueDirectFixed(rl_src2, TargetReg(kArg1));
@@ -1323,7 +1340,7 @@
       if (check_zero) {
         GenImmedCheck(kCondEq, TargetReg(kArg1), 0, kThrowDivZero);
       }
-      // NOTE: callout here is not a safepoint
+      // NOTE: callout here is not a safepoint.
       CallHelper(r_tgt, func_offset, false /* not a safepoint */);
       if (op == kOpDiv)
         rl_result = GetReturn(false);
@@ -1561,11 +1578,24 @@
       if (HandleEasyDivRem(opcode, is_div, rl_src, rl_dest, lit)) {
         return;
       }
+
+      bool done = false;
       if (cu_->instruction_set == kMips) {
         rl_src = LoadValue(rl_src, kCoreReg);
         rl_result = GenDivRemLit(rl_dest, rl_src.low_reg, lit, is_div);
-      } else {
-        FlushAllRegs();   /* Everything to home location */
+        done = true;
+      } else if (cu_->instruction_set == kThumb2) {
+        if (cu_->GetInstructionSetFeatures().HasDivideInstruction()) {
+          // Use ARM SDIV instruction for division.  For remainder we also need to
+          // calculate using a MUL and subtract.
+          rl_src = LoadValue(rl_src, kCoreReg);
+          rl_result = GenDivRemLit(rl_dest, rl_src.low_reg, lit, is_div);
+          done = true;
+        }
+      }
+
+      if (!done) {
+        FlushAllRegs();   /* Everything to home location. */
         LoadValueDirectFixed(rl_src, TargetReg(kArg0));
         Clobber(TargetReg(kArg0));
         ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pIdivmod);
@@ -1583,7 +1613,7 @@
   }
   rl_src = LoadValue(rl_src, kCoreReg);
   rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  // Avoid shifts by literal 0 - no support in Thumb.  Change to copy
+  // Avoid shifts by literal 0 - no support in Thumb.  Change to copy.
   if (shift_op && (lit == 0)) {
     OpRegCopy(rl_result.low_reg, rl_src.low_reg);
   } else {
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 783c322..4871e16 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -336,10 +336,12 @@
                                                std::string const& filename);
 
 CompilerDriver::CompilerDriver(CompilerBackend compiler_backend, InstructionSet instruction_set,
+                               InstructionSetFeatures instruction_set_features,
                                bool image, DescriptorSet* image_classes, size_t thread_count,
                                bool dump_stats)
     : compiler_backend_(compiler_backend),
       instruction_set_(instruction_set),
+      instruction_set_features_(instruction_set_features),
       freezing_constructor_lock_("freezing constructor lock"),
       compiled_classes_lock_("compiled classes lock"),
       compiled_methods_lock_("compiled method lock"),
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 9b9a884..9321f06 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -91,6 +91,7 @@
   // can assume will be in the image, with NULL implying all available
   // classes.
   explicit CompilerDriver(CompilerBackend compiler_backend, InstructionSet instruction_set,
+                          InstructionSetFeatures instruction_set_features,
                           bool image, DescriptorSet* image_classes,
                           size_t thread_count, bool dump_stats);
 
@@ -104,10 +105,14 @@
   void CompileOne(const mirror::ArtMethod* method, base::TimingLogger& timings)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  InstructionSet GetInstructionSet() const {
+  const InstructionSet& GetInstructionSet() const {
     return instruction_set_;
   }
 
+  const InstructionSetFeatures& GetInstructionSetFeatures() const {
+    return instruction_set_features_;
+  }
+
   CompilerBackend GetCompilerBackend() const {
     return compiler_backend_;
   }
@@ -386,7 +391,8 @@
 
   CompilerBackend compiler_backend_;
 
-  InstructionSet instruction_set_;
+  const InstructionSet instruction_set_;
+  const InstructionSetFeatures instruction_set_features_;
 
   // All class references that require
   mutable ReaderWriterMutex freezing_constructor_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 815bca5..6213b45 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -76,7 +76,10 @@
   CompilerBackend compiler_backend = kQuick;
 #endif
   InstructionSet insn_set = kIsTargetBuild ? kThumb2 : kX86;
-  compiler_driver_.reset(new CompilerDriver(compiler_backend, insn_set, false, NULL, 2, true));
+
+  InstructionSetFeatures insn_features;
+  compiler_driver_.reset(new CompilerDriver(compiler_backend, insn_set,
+                                            insn_features, false, NULL, 2, true));
   jobject class_loader = NULL;
   if (kCompile) {
     base::TimingLogger timings("OatTest::WriteRead", false, false);
@@ -149,17 +152,19 @@
 TEST_F(OatTest, OatHeaderSizeCheck) {
   // If this test is failing and you have to update these constants,
   // it is time to update OatHeader::kOatVersion
-  EXPECT_EQ(72U, sizeof(OatHeader));
+  EXPECT_EQ(76U, sizeof(OatHeader));
   EXPECT_EQ(28U, sizeof(OatMethodOffsets));
 }
 
 TEST_F(OatTest, OatHeaderIsValid) {
     InstructionSet instruction_set = kX86;
+    InstructionSetFeatures instruction_set_features;
     std::vector<const DexFile*> dex_files;
     uint32_t image_file_location_oat_checksum = 0;
     uint32_t image_file_location_oat_begin = 0;
     const std::string image_file_location;
     OatHeader oat_header(instruction_set,
+                         instruction_set_features,
                          &dex_files,
                          image_file_location_oat_checksum,
                          image_file_location_oat_begin,
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 28355bf..f3bb112 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -98,6 +98,7 @@
 size_t OatWriter::InitOatHeader() {
   // create the OatHeader
   oat_header_ = new OatHeader(compiler_driver_->GetInstructionSet(),
+                              compiler_driver_->GetInstructionSetFeatures(),
                               dex_files_,
                               image_file_location_oat_checksum_,
                               image_file_location_oat_begin_,