Opt compiler: Speedup div/rem by constants on arm32 and arm64.

This patch also includes:
1. Add java test for div/rem negative constants.
2. Fix a thumb2 encoding issue where the last operand is
   "reg, shift #amount" in some instructions.
3. Support a simple filter in arm32 assembler test to filter out
   unsupported cases, such as "smull r0, r0, r1, r2".
4. Add smull arm32 assembler test.
5. Add smull/umull thumb2 test.
6. Add test for the thumb2 encoding issue which is fixed in this
   patch.

Change-Id: I1601bc9c38f70f11909f2816fe3ec105a158951e
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index 313f365..dee8287 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -398,6 +398,8 @@
                    Condition cond = AL) = 0;
   virtual void mls(Register rd, Register rn, Register rm, Register ra,
                    Condition cond = AL) = 0;
+  virtual void smull(Register rd_lo, Register rd_hi, Register rn, Register rm,
+                     Condition cond = AL) = 0;
   virtual void umull(Register rd_lo, Register rd_hi, Register rn, Register rm,
                      Condition cond = AL) = 0;
 
diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc
index 9579691..6e165fc 100644
--- a/compiler/utils/arm/assembler_arm32.cc
+++ b/compiler/utils/arm/assembler_arm32.cc
@@ -200,6 +200,13 @@
 }
 
 
+void Arm32Assembler::smull(Register rd_lo, Register rd_hi, Register rn,
+                           Register rm, Condition cond) {
+  // Assembler registers rd_lo, rd_hi, rn, rm are encoded as rd, rn, rm, rs.
+  EmitMulOp(cond, B23 | B22, rd_lo, rd_hi, rn, rm);
+}
+
+
 void Arm32Assembler::umull(Register rd_lo, Register rd_hi, Register rn,
                            Register rm, Condition cond) {
   // Assembler registers rd_lo, rd_hi, rn, rm are encoded as rd, rn, rm, rs.
diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h
index b922d66..55ec7b4 100644
--- a/compiler/utils/arm/assembler_arm32.h
+++ b/compiler/utils/arm/assembler_arm32.h
@@ -90,6 +90,8 @@
            Condition cond = AL) OVERRIDE;
   void mls(Register rd, Register rn, Register rm, Register ra,
            Condition cond = AL) OVERRIDE;
+  void smull(Register rd_lo, Register rd_hi, Register rn, Register rm,
+             Condition cond = AL) OVERRIDE;
   void umull(Register rd_lo, Register rd_hi, Register rn, Register rm,
              Condition cond = AL) OVERRIDE;
 
diff --git a/compiler/utils/arm/assembler_arm32_test.cc b/compiler/utils/arm/assembler_arm32_test.cc
index 4a0ae0b..efd517b 100644
--- a/compiler/utils/arm/assembler_arm32_test.cc
+++ b/compiler/utils/arm/assembler_arm32_test.cc
@@ -293,12 +293,29 @@
     f();
   }
 
+  // NOTE: Only support simple test like "aaa=bbb"
+  bool EvalFilterString(std::string filter) {
+    if (filter.compare("") == 0) {
+      return false;
+    }
+
+    size_t equal_sign_index = filter.find('=');
+    if (equal_sign_index == std::string::npos) {
+      EXPECT_TRUE(false) << "Unsupported filter string.";
+    }
+
+    std::string lhs = filter.substr(0, equal_sign_index);
+    std::string rhs = filter.substr(equal_sign_index + 1, std::string::npos);
+    return lhs.compare(rhs) == 0;
+  }
+
   void TemplateHelper(std::function<void(arm::Register)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc,
-                      std::string fmt, std::ostringstream& oss) {
+                      bool without_pc, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
     for (auto reg : registers) {
       std::string after_reg = fmt;
+      std::string after_reg_filter = filter;
 
       std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
       size_t reg_index;
@@ -308,14 +325,23 @@
         after_reg.replace(reg_index, strlen(reg_token), reg_string);
       }
 
+      while ((reg_index = after_reg_filter.find(reg_token)) != std::string::npos) {
+        after_reg_filter.replace(reg_index, strlen(reg_token), reg_string);
+      }
+      if (EvalFilterString(after_reg_filter)) {
+        continue;
+      }
+
       ExecuteAndPrint([&] () { f(*reg); }, after_reg, oss);
     }
   }
 
   void TemplateHelper(std::function<void(const arm::ShifterOperand&)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::ostringstream& oss) {
+                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     for (const arm::ShifterOperand& shift : GetShiftOperands()) {
       std::string after_shift = fmt;
+      std::string after_shift_filter = filter;
 
       std::string shift_string = GetShiftString(shift);
       size_t shift_index;
@@ -323,30 +349,48 @@
         after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
       }
 
+      while ((shift_index = after_shift_filter.find(SHIFT_TOKEN)) != std::string::npos) {
+        after_shift_filter.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+      }
+      if (EvalFilterString(after_shift_filter)) {
+        continue;
+      }
+
       ExecuteAndPrint([&] () { f(shift); }, after_shift, oss);
     }
   }
 
   void TemplateHelper(std::function<void(arm::Condition)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::ostringstream& oss) {
+                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     for (arm::Condition c : GetConditions()) {
       std::string after_cond = fmt;
+      std::string after_cond_filter = filter;
 
       size_t cond_index = after_cond.find(COND_TOKEN);
       if (cond_index != std::string::npos) {
         after_cond.replace(cond_index, ConstexprStrLen(IMM1_TOKEN), GetConditionString(c));
       }
 
+      cond_index = after_cond_filter.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond_filter.replace(cond_index, ConstexprStrLen(IMM1_TOKEN), GetConditionString(c));
+      }
+      if (EvalFilterString(after_cond_filter)) {
+        continue;
+      }
+
       ExecuteAndPrint([&] () { f(c); }, after_cond, oss);
     }
   }
 
   template <typename... Args>
   void TemplateHelper(std::function<void(arm::Register, Args...)> f, int depth, bool without_pc,
-                      std::string fmt, std::ostringstream& oss) {
+                      std::string fmt, std::string filter, std::ostringstream& oss) {
     std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
     for (auto reg : registers) {
       std::string after_reg = fmt;
+      std::string after_reg_filter = filter;
 
       std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
       size_t reg_index;
@@ -356,17 +400,26 @@
         after_reg.replace(reg_index, strlen(reg_token), reg_string);
       }
 
+      while ((reg_index = after_reg_filter.find(reg_token)) != std::string::npos) {
+        after_reg_filter.replace(reg_index, strlen(reg_token), reg_string);
+      }
+      if (EvalFilterString(after_reg_filter)) {
+        continue;
+      }
+
       auto lambda = [&] (Args... args) { f(*reg, args...); };  // NOLINT [readability/braces] [4]
       TemplateHelper(std::function<void(Args...)>(lambda), depth + 1, without_pc,
-          after_reg, oss);
+          after_reg, after_reg_filter, oss);
     }
   }
 
   template <typename... Args>
   void TemplateHelper(std::function<void(const arm::ShifterOperand&, Args...)> f, int depth,
-                      bool without_pc, std::string fmt, std::ostringstream& oss) {
+                      bool without_pc, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     for (const arm::ShifterOperand& shift : GetShiftOperands()) {
       std::string after_shift = fmt;
+      std::string after_shift_filter = filter;
 
       std::string shift_string = GetShiftString(shift);
       size_t shift_index;
@@ -374,26 +427,42 @@
         after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
       }
 
+      while ((shift_index = after_shift_filter.find(SHIFT_TOKEN)) != std::string::npos) {
+        after_shift_filter.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+      }
+      if (EvalFilterString(after_shift_filter)) {
+        continue;
+      }
+
       auto lambda = [&] (Args... args) { f(shift, args...); };  // NOLINT [readability/braces] [4]
       TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
-          after_shift, oss);
+          after_shift, after_shift_filter, oss);
     }
   }
 
   template <typename... Args>
   void TemplateHelper(std::function<void(arm::Condition, Args...)> f, int depth, bool without_pc,
-                      std::string fmt, std::ostringstream& oss) {
+                      std::string fmt, std::string filter, std::ostringstream& oss) {
     for (arm::Condition c : GetConditions()) {
       std::string after_cond = fmt;
+      std::string after_cond_filter = filter;
 
       size_t cond_index = after_cond.find(COND_TOKEN);
       if (cond_index != std::string::npos) {
         after_cond.replace(cond_index, ConstexprStrLen(IMM1_TOKEN), GetConditionString(c));
       }
 
+      cond_index = after_cond_filter.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond_filter.replace(cond_index, ConstexprStrLen(IMM1_TOKEN), GetConditionString(c));
+      }
+      if (EvalFilterString(after_cond_filter)) {
+        continue;
+      }
+
       auto lambda = [&] (Args... args) { f(c, args...); };  // NOLINT [readability/braces] [4]
       TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
-          after_cond, oss);
+          after_cond, after_cond_filter, oss);
     }
   }
 
@@ -421,13 +490,13 @@
 
   template <typename... Args>
   void GenericTemplateHelper(std::function<void(Args...)> f, bool without_pc,
-                             std::string fmt, std::string test_name) {
+                             std::string fmt, std::string test_name, std::string filter) {
     first_ = false;
     WarnOnCombinations(CountHelper<Args...>(without_pc));
 
     std::ostringstream oss;
 
-    TemplateHelper(f, 0, without_pc, fmt, oss);
+    TemplateHelper(f, 0, without_pc, fmt, filter, oss);
 
     oss << "\n";  // Trailing newline.
 
@@ -436,26 +505,26 @@
 
   template <typename... Args>
   void T2Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-                std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction2(f), without_pc, fmt, test_name);
+                std::string test_name, std::string filter = "") {
+    GenericTemplateHelper(GetBoundFunction2(f), without_pc, fmt, test_name, filter);
   }
 
   template <typename... Args>
   void T3Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction3(f), without_pc, fmt, test_name);
+      std::string test_name, std::string filter = "") {
+    GenericTemplateHelper(GetBoundFunction3(f), without_pc, fmt, test_name, filter);
   }
 
   template <typename... Args>
   void T4Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction4(f), without_pc, fmt, test_name);
+      std::string test_name, std::string filter = "") {
+    GenericTemplateHelper(GetBoundFunction4(f), without_pc, fmt, test_name, filter);
   }
 
   template <typename... Args>
   void T5Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction5(f), without_pc, fmt, test_name);
+      std::string test_name, std::string filter = "") {
+    GenericTemplateHelper(GetBoundFunction5(f), without_pc, fmt, test_name, filter);
   }
 
  private:
@@ -565,15 +634,18 @@
 }
 
 TEST_F(AssemblerArm32Test, Mla) {
-  T5Helper(&arm::Arm32Assembler::mla, true, "mla{cond} {reg1}, {reg2}, {reg3}, {reg4}", "mul");
+  T5Helper(&arm::Arm32Assembler::mla, true, "mla{cond} {reg1}, {reg2}, {reg3}, {reg4}", "mla");
 }
 
-/* TODO: Needs support to filter out register combinations, as rdhi must not be equal to rdlo.
 TEST_F(AssemblerArm32Test, Umull) {
   T5Helper(&arm::Arm32Assembler::umull, true, "umull{cond} {reg1}, {reg2}, {reg3}, {reg4}",
-           "umull");
+           "umull", "{reg1}={reg2}");  // Skip the cases where reg1 == reg2.
 }
-*/
+
+TEST_F(AssemblerArm32Test, Smull) {
+  T5Helper(&arm::Arm32Assembler::smull, true, "smull{cond} {reg1}, {reg2}, {reg3}, {reg4}",
+           "smull", "{reg1}={reg2}");  // Skip the cases where reg1 == reg2.
+}
 
 TEST_F(AssemblerArm32Test, Sdiv) {
   T4Helper(&arm::Arm32Assembler::sdiv, true, "sdiv{cond} {reg1}, {reg2}, {reg3}", "sdiv");
@@ -655,9 +727,10 @@
   T4Helper(&arm::Arm32Assembler::rsc, true, "rsc{cond} {reg1}, {reg2}, {shift}", "rsc");
 }
 
-/* TODO: Needs support to filter out register combinations, as reg1 must not be equal to reg3.
+/* TODO: Need better filter support.
 TEST_F(AssemblerArm32Test, Strex) {
-  RRRCWithoutPCHelper(&arm::Arm32Assembler::strex, "strex{cond} {reg1}, {reg2}, [{reg3}]", "strex");
+  T4Helper(&arm::Arm32Assembler::strex, "strex{cond} {reg1}, {reg2}, [{reg3}]", "strex",
+           "{reg1}={reg2}||{reg1}={reg3}");  // Skip the cases where reg1 == reg2 || reg1 == reg3.
 }
 */
 
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 3b42f63..e7cf26e 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -238,6 +238,24 @@
 }
 
 
+void Thumb2Assembler::smull(Register rd_lo, Register rd_hi, Register rn,
+                            Register rm, Condition cond) {
+  CheckCondition(cond);
+
+  uint32_t op1 = 0U /* 0b000; */;
+  uint32_t op2 = 0U /* 0b0000 */;
+  int32_t encoding = B31 | B30 | B29 | B28 | B27 | B25 | B24 | B23 |
+      op1 << 20 |
+      op2 << 4 |
+      static_cast<uint32_t>(rd_lo) << 12 |
+      static_cast<uint32_t>(rd_hi) << 8 |
+      static_cast<uint32_t>(rn) << 16 |
+      static_cast<uint32_t>(rm);
+
+  Emit32(encoding);
+}
+
+
 void Thumb2Assembler::umull(Register rd_lo, Register rd_hi, Register rn,
                             Register rm, Condition cond) {
   CheckCondition(cond);
@@ -740,13 +758,6 @@
     return true;
   }
 
-  // Check for MOV with an ROR.
-  if (opcode == MOV && so.IsRegister() && so.IsShift() && so.GetShift() == ROR) {
-    if (so.GetImmediate() != 0) {
-      return true;
-    }
-  }
-
   bool rn_is_valid = true;
 
   // Check for single operand instructions and ADD/SUB.
@@ -792,6 +803,19 @@
     }
   }
 
+  // Check for register shift operand.
+  if (so.IsRegister() && so.IsShift()) {
+    if (opcode != MOV) {
+      return true;
+    }
+    // Check for MOV with an ROR.
+    if (so.GetShift() == ROR) {
+      if (so.GetImmediate() != 0) {
+        return true;
+      }
+    }
+  }
+
   // The instruction can be encoded in 16 bits.
   return false;
 }
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index e33c240..17eae8b 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -112,6 +112,8 @@
            Condition cond = AL) OVERRIDE;
   void mls(Register rd, Register rn, Register rm, Register ra,
            Condition cond = AL) OVERRIDE;
+  void smull(Register rd_lo, Register rd_hi, Register rn, Register rm,
+             Condition cond = AL) OVERRIDE;
   void umull(Register rd_lo, Register rd_hi, Register rn, Register rm,
              Condition cond = AL) OVERRIDE;
 
diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc
index 5f5561a..733441b 100644
--- a/compiler/utils/arm/assembler_thumb2_test.cc
+++ b/compiler/utils/arm/assembler_thumb2_test.cc
@@ -89,23 +89,24 @@
   EXPECT_TRUE(CheckTools());
 }
 
+#define __ GetAssembler()->
 
 TEST_F(AssemblerThumb2Test, Sbfx) {
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 16);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 32);
+  __ sbfx(arm::R0, arm::R1, 0, 1);
+  __ sbfx(arm::R0, arm::R1, 0, 8);
+  __ sbfx(arm::R0, arm::R1, 0, 16);
+  __ sbfx(arm::R0, arm::R1, 0, 32);
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 16);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 24);
+  __ sbfx(arm::R0, arm::R1, 8, 1);
+  __ sbfx(arm::R0, arm::R1, 8, 8);
+  __ sbfx(arm::R0, arm::R1, 8, 16);
+  __ sbfx(arm::R0, arm::R1, 8, 24);
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 16);
+  __ sbfx(arm::R0, arm::R1, 16, 1);
+  __ sbfx(arm::R0, arm::R1, 16, 8);
+  __ sbfx(arm::R0, arm::R1, 16, 16);
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 31, 1);
+  __ sbfx(arm::R0, arm::R1, 31, 1);
 
   const char* expected =
       "sbfx r0, r1, #0, #1\n"
@@ -127,21 +128,21 @@
 }
 
 TEST_F(AssemblerThumb2Test, Ubfx) {
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 16);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 32);
+  __ ubfx(arm::R0, arm::R1, 0, 1);
+  __ ubfx(arm::R0, arm::R1, 0, 8);
+  __ ubfx(arm::R0, arm::R1, 0, 16);
+  __ ubfx(arm::R0, arm::R1, 0, 32);
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 16);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 24);
+  __ ubfx(arm::R0, arm::R1, 8, 1);
+  __ ubfx(arm::R0, arm::R1, 8, 8);
+  __ ubfx(arm::R0, arm::R1, 8, 16);
+  __ ubfx(arm::R0, arm::R1, 8, 24);
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 16);
+  __ ubfx(arm::R0, arm::R1, 16, 1);
+  __ ubfx(arm::R0, arm::R1, 16, 8);
+  __ ubfx(arm::R0, arm::R1, 16, 16);
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 31, 1);
+  __ ubfx(arm::R0, arm::R1, 31, 1);
 
   const char* expected =
       "ubfx r0, r1, #0, #1\n"
@@ -163,7 +164,7 @@
 }
 
 TEST_F(AssemblerThumb2Test, Vmstat) {
-  GetAssembler()->vmstat();
+  __ vmstat();
 
   const char* expected = "vmrs APSR_nzcv, FPSCR\n";
 
@@ -171,10 +172,10 @@
 }
 
 TEST_F(AssemblerThumb2Test, ldrexd) {
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R0);
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R1);
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R2);
-  GetAssembler()->ldrexd(arm::R5, arm::R3, arm::R7);
+  __ ldrexd(arm::R0, arm::R1, arm::R0);
+  __ ldrexd(arm::R0, arm::R1, arm::R1);
+  __ ldrexd(arm::R0, arm::R1, arm::R2);
+  __ ldrexd(arm::R5, arm::R3, arm::R7);
 
   const char* expected =
       "ldrexd r0, r1, [r0]\n"
@@ -185,10 +186,10 @@
 }
 
 TEST_F(AssemblerThumb2Test, strexd) {
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R0);
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R1);
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R2);
-  GetAssembler()->strexd(arm::R9, arm::R5, arm::R3, arm::R7);
+  __ strexd(arm::R9, arm::R0, arm::R1, arm::R0);
+  __ strexd(arm::R9, arm::R0, arm::R1, arm::R1);
+  __ strexd(arm::R9, arm::R0, arm::R1, arm::R2);
+  __ strexd(arm::R9, arm::R5, arm::R3, arm::R7);
 
   const char* expected =
       "strexd r9, r0, r1, [r0]\n"
@@ -199,9 +200,9 @@
 }
 
 TEST_F(AssemblerThumb2Test, LdrdStrd) {
-  GetAssembler()->ldrd(arm::R0, arm::Address(arm::R2, 8));
-  GetAssembler()->ldrd(arm::R0, arm::Address(arm::R12));
-  GetAssembler()->strd(arm::R0, arm::Address(arm::R2, 8));
+  __ ldrd(arm::R0, arm::Address(arm::R2, 8));
+  __ ldrd(arm::R0, arm::Address(arm::R12));
+  __ strd(arm::R0, arm::Address(arm::R2, 8));
 
   const char* expected =
       "ldrd r0, r1, [r2, #8]\n"
@@ -211,7 +212,6 @@
 }
 
 TEST_F(AssemblerThumb2Test, eor) {
-#define __ GetAssembler()->
   __ eor(arm::R1, arm::R1, arm::ShifterOperand(arm::R0));
   __ eor(arm::R1, arm::R0, arm::ShifterOperand(arm::R1));
   __ eor(arm::R1, arm::R8, arm::ShifterOperand(arm::R0));
@@ -230,23 +230,47 @@
 TEST_F(AssemblerThumb2Test, sub) {
   __ subs(arm::R1, arm::R0, arm::ShifterOperand(42));
   __ sub(arm::R1, arm::R0, arm::ShifterOperand(42));
+  __ subs(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
+  __ sub(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
 
   const char* expected =
       "subs r1, r0, #42\n"
-      "subw r1, r0, #42\n";
+      "subw r1, r0, #42\n"
+      "subs r1, r0, r2, asr #31\n"
+      "sub r1, r0, r2, asr #31\n";
   DriverStr(expected, "sub");
 }
 
 TEST_F(AssemblerThumb2Test, add) {
   __ adds(arm::R1, arm::R0, arm::ShifterOperand(42));
   __ add(arm::R1, arm::R0, arm::ShifterOperand(42));
+  __ adds(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
+  __ add(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
 
   const char* expected =
       "adds r1, r0, #42\n"
-      "addw r1, r0, #42\n";
+      "addw r1, r0, #42\n"
+      "adds r1, r0, r2, asr #31\n"
+      "add r1, r0, r2, asr #31\n";
   DriverStr(expected, "add");
 }
 
+TEST_F(AssemblerThumb2Test, umull) {
+  __ umull(arm::R0, arm::R1, arm::R2, arm::R3);
+
+  const char* expected =
+      "umull r0, r1, r2, r3\n";
+  DriverStr(expected, "umull");
+}
+
+TEST_F(AssemblerThumb2Test, smull) {
+  __ smull(arm::R0, arm::R1, arm::R2, arm::R3);
+
+  const char* expected =
+      "smull r0, r1, r2, r3\n";
+  DriverStr(expected, "smull");
+}
+
 TEST_F(AssemblerThumb2Test, StoreWordToThumbOffset) {
   arm::StoreOperandType type = arm::kStoreWord;
   int32_t offset = 4092;