Optimize rem-int/lit too.

Bryan hit a bug in my div-int/lit optimization (it mistakenly tried to
rewrite rem-int/lit as well), which shows that I was wrong to assume that
% wouldn't be hot enough to be worth optimizing.

Before:

                     benchmark  ns logarithmic runtime
       RemainderIntByConstant2  44 XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
    RemainderIntByConstant2048  34 XXXXXXXXXXXXXXXXXXXXXX|||||
       RemainderIntByConstant8  44 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
       RemainderIntByVariable2  40 XXXXXXXXXXXXXXXXXXXXXXXXXXX||

After:

                     benchmark  ns logarithmic runtime
       RemainderIntByConstant2  13 XXXXXXXXX|||||||||||
    RemainderIntByConstant2048  16 XXXXXXXXXXXX||||||||||
       RemainderIntByConstant8  16 XXXXXXXXXXXX||||||||||
       RemainderIntByVariable2  40 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

Bug: 2614702
Change-Id: I719fc8765feececd5b73c3cb2e44dd3cf20c45ce
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index c691d15..68d5b84 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -1903,9 +1903,6 @@
 static bool handleEasyDivide(CompilationUnit *cUnit, OpCode dalvikOpCode,
                              RegLocation rlSrc, RegLocation rlDest, int lit)
 {
-    if (dalvikOpCode != OP_DIV_INT_LIT8 && dalvikOpCode != OP_DIV_INT_LIT16) {
-        return false;
-    }
     if (lit < 2 || !isPowerOfTwo(lit)) {
         return false;
     }
@@ -1914,19 +1911,39 @@
         // Avoid special cases.
         return false;
     }
+    bool div = (dalvikOpCode == OP_DIV_INT_LIT8 || dalvikOpCode == OP_DIV_INT_LIT16);
     rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
     RegLocation rlResult = dvmCompilerEvalLoc(cUnit, rlDest, kCoreReg, true);
-    int tReg = dvmCompilerAllocTemp(cUnit);
-    if (lit == 2) {
-        // Division by 2 is by far the most common division by constant.
-        opRegRegImm(cUnit, kOpLsr, tReg, rlSrc.lowReg, 32 - k);
-        opRegRegReg(cUnit, kOpAdd, tReg, tReg, rlSrc.lowReg);
-        opRegRegImm(cUnit, kOpAsr, rlResult.lowReg, tReg, k);
+    if (div) {
+        int tReg = dvmCompilerAllocTemp(cUnit);
+        if (lit == 2) {
+            // Division by 2 is by far the most common division by constant.
+            opRegRegImm(cUnit, kOpLsr, tReg, rlSrc.lowReg, 32 - k);
+            opRegRegReg(cUnit, kOpAdd, tReg, tReg, rlSrc.lowReg);
+            opRegRegImm(cUnit, kOpAsr, rlResult.lowReg, tReg, k);
+        } else {
+            opRegRegImm(cUnit, kOpAsr, tReg, rlSrc.lowReg, 31);
+            opRegRegImm(cUnit, kOpLsr, tReg, tReg, 32 - k);
+            opRegRegReg(cUnit, kOpAdd, tReg, tReg, rlSrc.lowReg);
+            opRegRegImm(cUnit, kOpAsr, rlResult.lowReg, tReg, k);
+        }
     } else {
-        opRegRegImm(cUnit, kOpAsr, tReg, rlSrc.lowReg, 31);
-        opRegRegImm(cUnit, kOpLsr, tReg, tReg, 32 - k);
-        opRegRegReg(cUnit, kOpAdd, tReg, tReg, rlSrc.lowReg);
-        opRegRegImm(cUnit, kOpAsr, rlResult.lowReg, tReg, k);
+        int cReg = dvmCompilerAllocTemp(cUnit);
+        loadConstant(cUnit, cReg, lit - 1);
+        int tReg1 = dvmCompilerAllocTemp(cUnit);
+        int tReg2 = dvmCompilerAllocTemp(cUnit);
+        if (lit == 2) {
+            opRegRegImm(cUnit, kOpLsr, tReg1, rlSrc.lowReg, 32 - k);
+            opRegRegReg(cUnit, kOpAdd, tReg2, tReg1, rlSrc.lowReg);
+            opRegRegReg(cUnit, kOpAnd, tReg2, tReg2, cReg);
+            opRegRegReg(cUnit, kOpSub, rlResult.lowReg, tReg2, tReg1);
+        } else {
+            opRegRegImm(cUnit, kOpAsr, tReg1, rlSrc.lowReg, 31);
+            opRegRegImm(cUnit, kOpLsr, tReg1, tReg1, 32 - k);
+            opRegRegReg(cUnit, kOpAdd, tReg2, tReg1, rlSrc.lowReg);
+            opRegRegReg(cUnit, kOpAnd, tReg2, tReg2, cReg);
+            opRegRegReg(cUnit, kOpSub, rlResult.lowReg, tReg2, tReg1);
+        }
     }
     storeValue(cUnit, rlDest, rlResult);
     return true;