Inline memmove for small constant sizes and refactor memcpy and memset.

The memory intrinsics are only optimized at -O1 and higher unless the
-fmem-intrin-opt flag is set to force the optimization to take place.
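
For example, to force the inlining at -Om1 (an illustrative invocation;
the updated lit test passes the same flags, and the mem_intrin crosstest
forwards them through the new --sz option):

  pnacl-sz -Om1 --fmem-intrin-opt <input bitcode>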

This change also introduces the xchg instruction for two register operands.
It is no longer used by the memory intrinsic lowering (or by anything else),
but the implementation is left in for future use.
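
As a sketch of the unrolling strategy, a constant 30-byte memcpy now
lowers to two overlapping 16-byte vector copies instead of a call
(registers and addressing are illustrative; the exact expectations are
in the updated lit tests):

  movups xmm0,XMMWORD PTR [esi]
  movups XMMWORD PTR [edi],xmm0
  movups xmm0,XMMWORD PTR [esi+0xe]
  movups XMMWORD PTR [edi+0xe],xmm0

The second copy starts at offset 0xe so the two stores cover all 30
bytes. Rewriting the overlapping middle bytes is harmless for memcpy;
memmove gets the same treatment but loads everything into registers
before storing, so an overlapping source is safe as well.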

BUG=
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1278173009.
diff --git a/Makefile.standalone b/Makefile.standalone
index d53332f..18ecc9a 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -227,6 +227,9 @@
 # The X86 assembler tests take too long to compile. Given how infrequently the
 # assembler will change, we disable them.
 ifdef CHECK_X86_ASM
+ifndef DEBUG
+$(error Run check-unit with DEBUG=1 lest your machine perish)
+endif
   UNITTEST_SRCS += AssemblerX8632/LowLevel.cpp \
         AssemblerX8632/DataMov.cpp \
         AssemblerX8632/Locked.cpp \
@@ -350,11 +353,14 @@
        # Do all native/sse2 tests, but only test_vector_ops for native/sse4.1.
        # For (slow) sandboxed tests, limit to Om1/sse4.1.
        # TODO(jpp): implement x8664 sandbox, then enable xtests.
+       # TODO(jpp): re-enable the x86-64 tests.
 	./pydir/crosstest_generator.py -v --lit \
 	  --toolchain-root $(TOOLCHAIN_ROOT) \
-	  -i x8632,native,sse2 -i x8632,native,sse4.1,test_vector_ops \
+	  -i x8632,native,sse2 \
+	  -i x8632,native,sse4.1,test_vector_ops \
 	  -i x8632,sandbox,sse4.1,Om1 \
-	  -i x8664,native,sse2 -i x8664,native,sse4.1,test_vector_ops \
+	  -e x8664,native,sse2 \
+	  -e x8664,native,sse4.1,test_vector_ops \
 	  -e x8664,native,sse2,test_global \
 	  -i arm32,native,neon,simple_loop \
 	  -i arm32,native,neon,mem_intrin \
diff --git a/crosstest/crosstest.cfg b/crosstest/crosstest.cfg
index 620edb6..c92963c 100644
--- a/crosstest/crosstest.cfg
+++ b/crosstest/crosstest.cfg
@@ -5,6 +5,7 @@
 [mem_intrin]
 driver: mem_intrin_main.cpp
 test: mem_intrin.cpp
+flags: --sz=-fmem-intrin-opt
 
 [test_arith]
 driver: test_arith_main.cpp
diff --git a/crosstest/mem_intrin.cpp b/crosstest/mem_intrin.cpp
index 0fe0387..b84cdb9 100644
--- a/crosstest/mem_intrin.cpp
+++ b/crosstest/mem_intrin.cpp
@@ -40,30 +40,6 @@
   return (sum_of_sums << 8) | sum;
 }
 
-#define NWORDS 32
-#define BYTE_LENGTH (NWORDS * sizeof(elem_t))
-
-int memcpy_test_fixed_len(uint8_t init) {
-  elem_t buf[NWORDS];
-  elem_t buf2[NWORDS];
-  reset_buf((uint8_t *)buf, init, BYTE_LENGTH);
-  memcpy((void *)buf2, (void *)buf, BYTE_LENGTH);
-  return fletcher_checksum((uint8_t *)buf2, BYTE_LENGTH);
-}
-
-int memmove_test_fixed_len(uint8_t init) {
-  elem_t buf[NWORDS];
-  reset_buf((uint8_t *)buf, init, BYTE_LENGTH);
-  memmove((void *)(buf + 4), (void *)buf, BYTE_LENGTH - (4 * sizeof(elem_t)));
-  return fletcher_checksum((uint8_t *)buf + 4, BYTE_LENGTH - 4);
-}
-
-int memset_test_fixed_len(uint8_t init) {
-  elem_t buf[NWORDS];
-  memset((void *)buf, init, BYTE_LENGTH);
-  return fletcher_checksum((uint8_t *)buf, BYTE_LENGTH);
-}
-
 int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length) {
   reset_buf(buf, init, length);
   memcpy((void *)buf2, (void *)buf, length);
@@ -94,3 +70,33 @@
   memset((void *)buf2, init + 4, length);
   return fletcher_checksum(buf, length) + fletcher_checksum(buf2, length);
 }
+
+#define X(NBYTES)                                                              \
+  int memcpy_test_fixed_len_##NBYTES(uint8_t init) {                           \
+    uint8_t buf[NBYTES];                                                       \
+    uint8_t buf2[NBYTES];                                                      \
+    reset_buf(buf, init, NBYTES);                                              \
+    memcpy((void *)buf2, (void *)buf, NBYTES);                                 \
+    return fletcher_checksum(buf2, NBYTES);                                    \
+  }                                                                            \
+                                                                               \
+  int memmove_test_fixed_len_##NBYTES(uint8_t init) {                          \
+    uint8_t buf[NBYTES + 16];                                                  \
+    uint8_t buf2[NBYTES + 16];                                                 \
+    reset_buf(buf, init, NBYTES + 16);                                         \
+    reset_buf(buf2, init, NBYTES + 16);                                        \
+    /* Move up */                                                              \
+    memmove((void *)(buf + 16), (void *)buf, NBYTES);                          \
+    /* Move down */                                                            \
+    memmove((void *)buf2, (void *)(buf2 + 16), NBYTES);                        \
+    return fletcher_checksum(buf, NBYTES + 16) +                               \
+           fletcher_checksum(buf2, NBYTES + 16);                               \
+  }                                                                            \
+                                                                               \
+  int memset_test_fixed_len_##NBYTES(uint8_t init) {                           \
+    uint8_t buf[NBYTES];                                                       \
+    memset((void *)buf, init, NBYTES);                                         \
+    return fletcher_checksum(buf, NBYTES);                                     \
+  }
+MEMINTRIN_SIZE_TABLE
+#undef X
diff --git a/crosstest/mem_intrin.def b/crosstest/mem_intrin.def
new file mode 100644
index 0000000..16484ad
--- /dev/null
+++ b/crosstest/mem_intrin.def
@@ -0,0 +1,258 @@
+#define MEMINTRIN_SIZE_TABLE \
+  X(0)                       \
+  X(1)                       \
+  X(2)                       \
+  X(3)                       \
+  X(4)                       \
+  X(5)                       \
+  X(6)                       \
+  X(7)                       \
+  X(8)                       \
+  X(9)                       \
+  X(10)                      \
+  X(11)                      \
+  X(12)                      \
+  X(13)                      \
+  X(14)                      \
+  X(15)                      \
+  X(16)                      \
+  X(17)                      \
+  X(18)                      \
+  X(19)                      \
+  X(20)                      \
+  X(21)                      \
+  X(22)                      \
+  X(23)                      \
+  X(24)                      \
+  X(25)                      \
+  X(26)                      \
+  X(27)                      \
+  X(28)                      \
+  X(29)                      \
+  X(30)                      \
+  X(31)                      \
+  X(32)                      \
+  X(33)                      \
+  X(34)                      \
+  X(35)                      \
+  X(36)                      \
+  X(37)                      \
+  X(38)                      \
+  X(39)                      \
+  X(40)                      \
+  X(41)                      \
+  X(42)                      \
+  X(43)                      \
+  X(44)                      \
+  X(45)                      \
+  X(46)                      \
+  X(47)                      \
+  X(48)                      \
+  X(49)                      \
+  X(50)                      \
+  X(51)                      \
+  X(52)                      \
+  X(53)                      \
+  X(54)                      \
+  X(55)                      \
+  X(56)                      \
+  X(57)                      \
+  X(58)                      \
+  X(59)                      \
+  X(60)                      \
+  X(61)                      \
+  X(62)                      \
+  X(63)                      \
+  X(64)                      \
+  X(65)                      \
+  X(66)                      \
+  X(67)                      \
+  X(68)                      \
+  X(69)                      \
+  X(70)                      \
+  X(71)                      \
+  X(72)                      \
+  X(73)                      \
+  X(74)                      \
+  X(75)                      \
+  X(76)                      \
+  X(77)                      \
+  X(78)                      \
+  X(79)                      \
+  X(80)                      \
+  X(81)                      \
+  X(82)                      \
+  X(83)                      \
+  X(84)                      \
+  X(85)                      \
+  X(86)                      \
+  X(87)                      \
+  X(88)                      \
+  X(89)                      \
+  X(90)                      \
+  X(91)                      \
+  X(92)                      \
+  X(93)                      \
+  X(94)                      \
+  X(95)                      \
+  X(96)                      \
+  X(97)                      \
+  X(98)                      \
+  X(99)                      \
+  X(100)                     \
+  X(101)                     \
+  X(102)                     \
+  X(103)                     \
+  X(104)                     \
+  X(105)                     \
+  X(106)                     \
+  X(107)                     \
+  X(108)                     \
+  X(109)                     \
+  X(110)                     \
+  X(111)                     \
+  X(112)                     \
+  X(113)                     \
+  X(114)                     \
+  X(115)                     \
+  X(116)                     \
+  X(117)                     \
+  X(118)                     \
+  X(119)                     \
+  X(120)                     \
+  X(121)                     \
+  X(122)                     \
+  X(123)                     \
+  X(124)                     \
+  X(125)                     \
+  X(126)                     \
+  X(127)                     \
+  X(128)                     \
+  X(129)                     \
+  X(130)                     \
+  X(131)                     \
+  X(132)                     \
+  X(133)                     \
+  X(134)                     \
+  X(135)                     \
+  X(136)                     \
+  X(137)                     \
+  X(138)                     \
+  X(139)                     \
+  X(140)                     \
+  X(141)                     \
+  X(142)                     \
+  X(143)                     \
+  X(144)                     \
+  X(145)                     \
+  X(146)                     \
+  X(147)                     \
+  X(148)                     \
+  X(149)                     \
+  X(150)                     \
+  X(151)                     \
+  X(152)                     \
+  X(153)                     \
+  X(154)                     \
+  X(155)                     \
+  X(156)                     \
+  X(157)                     \
+  X(158)                     \
+  X(159)                     \
+  X(160)                     \
+  X(161)                     \
+  X(162)                     \
+  X(163)                     \
+  X(164)                     \
+  X(165)                     \
+  X(166)                     \
+  X(167)                     \
+  X(168)                     \
+  X(169)                     \
+  X(170)                     \
+  X(171)                     \
+  X(172)                     \
+  X(173)                     \
+  X(174)                     \
+  X(175)                     \
+  X(176)                     \
+  X(177)                     \
+  X(178)                     \
+  X(179)                     \
+  X(180)                     \
+  X(181)                     \
+  X(182)                     \
+  X(183)                     \
+  X(184)                     \
+  X(185)                     \
+  X(186)                     \
+  X(187)                     \
+  X(188)                     \
+  X(189)                     \
+  X(190)                     \
+  X(191)                     \
+  X(192)                     \
+  X(193)                     \
+  X(194)                     \
+  X(195)                     \
+  X(196)                     \
+  X(197)                     \
+  X(198)                     \
+  X(199)                     \
+  X(200)                     \
+  X(201)                     \
+  X(202)                     \
+  X(203)                     \
+  X(204)                     \
+  X(205)                     \
+  X(206)                     \
+  X(207)                     \
+  X(208)                     \
+  X(209)                     \
+  X(210)                     \
+  X(211)                     \
+  X(212)                     \
+  X(213)                     \
+  X(214)                     \
+  X(215)                     \
+  X(216)                     \
+  X(217)                     \
+  X(218)                     \
+  X(219)                     \
+  X(220)                     \
+  X(221)                     \
+  X(222)                     \
+  X(223)                     \
+  X(224)                     \
+  X(225)                     \
+  X(226)                     \
+  X(227)                     \
+  X(228)                     \
+  X(229)                     \
+  X(230)                     \
+  X(231)                     \
+  X(232)                     \
+  X(233)                     \
+  X(234)                     \
+  X(235)                     \
+  X(236)                     \
+  X(237)                     \
+  X(238)                     \
+  X(239)                     \
+  X(240)                     \
+  X(241)                     \
+  X(242)                     \
+  X(243)                     \
+  X(244)                     \
+  X(245)                     \
+  X(246)                     \
+  X(247)                     \
+  X(248)                     \
+  X(249)                     \
+  X(250)                     \
+  X(251)                     \
+  X(252)                     \
+  X(253)                     \
+  X(254)                     \
+  X(255)                     \
+  X(256)
diff --git a/crosstest/mem_intrin.h b/crosstest/mem_intrin.h
index f04e1b2..8b13fa2 100644
--- a/crosstest/mem_intrin.h
+++ b/crosstest/mem_intrin.h
@@ -6,10 +6,15 @@
  */
 #include "xdefs.h"
 
+#include "mem_intrin.def"
+
 int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
 int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
 int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
 
-int memcpy_test_fixed_len(uint8_t init);
-int memmove_test_fixed_len(uint8_t init);
-int memset_test_fixed_len(uint8_t init);
+#define X(NBYTES)                                                              \
+  int memcpy_test_fixed_len_##NBYTES(uint8_t init);                            \
+  int memmove_test_fixed_len_##NBYTES(uint8_t init);                           \
+  int memset_test_fixed_len_##NBYTES(uint8_t init);
+MEMINTRIN_SIZE_TABLE
+#undef X
diff --git a/crosstest/mem_intrin_main.cpp b/crosstest/mem_intrin_main.cpp
index e1102ec..3b5135d 100644
--- a/crosstest/mem_intrin_main.cpp
+++ b/crosstest/mem_intrin_main.cpp
@@ -14,27 +14,6 @@
 #define XSTR(s) STR(s)
 #define STR(s) #s
 
-void testFixedLen(SizeT &TotalTests, SizeT &Passes, SizeT &Failures) {
-#define do_test_fixed(test_func)                                               \
-  for (uint8_t init_val = 0; init_val < 100; ++init_val) {                     \
-    ++TotalTests;                                                              \
-    int llc_result = test_func(init_val);                                      \
-    int sz_result = Subzero_::test_func(init_val);                             \
-    if (llc_result == sz_result) {                                             \
-      ++Passes;                                                                \
-    } else {                                                                   \
-      ++Failures;                                                              \
-      printf("Failure (%s): init_val=%d, llc=%d, sz=%d\n", STR(test_func),     \
-             init_val, llc_result, sz_result);                                 \
-    }                                                                          \
-  }
-
-  do_test_fixed(memcpy_test_fixed_len);
-  do_test_fixed(memmove_test_fixed_len);
-  do_test_fixed(memset_test_fixed_len)
-#undef do_test_fixed
-}
-
 void testVariableLen(SizeT &TotalTests, SizeT &Passes, SizeT &Failures) {
   uint8_t buf[256];
   uint8_t buf2[256];
@@ -60,6 +39,30 @@
 #undef do_test_variable
 }
 
+void testFixedLen(SizeT &TotalTests, SizeT &Passes, SizeT &Failures) {
+#define do_test_fixed(test_func, NBYTES)                                       \
+  for (uint8_t init_val = 0; init_val < 100; ++init_val) {                     \
+    ++TotalTests;                                                              \
+    int llc_result = test_func##_##NBYTES(init_val);                           \
+    int sz_result = Subzero_::test_func##_##NBYTES(init_val);                  \
+    if (llc_result == sz_result) {                                             \
+      ++Passes;                                                                \
+    } else {                                                                   \
+      ++Failures;                                                              \
+      printf("Failure (%s): init_val=%d, len=%d, llc=%d, sz=%d\n",             \
+             STR(test_func), init_val, NBYTES, llc_result, sz_result);         \
+    }                                                                          \
+  }
+
+#define X(NBYTES)                                                              \
+  do_test_fixed(memcpy_test_fixed_len, NBYTES);                                \
+  do_test_fixed(memmove_test_fixed_len, NBYTES);                               \
+  do_test_fixed(memset_test_fixed_len, NBYTES);
+  MEMINTRIN_SIZE_TABLE
+#undef X
+#undef do_test_fixed
+}
+
 #ifdef X8664_STACK_HACK
 extern "C" int wrapped_main(int argc, char *argv[]) {
 #else  // !defined(X8664_STACK_HACK)
diff --git a/pydir/crosstest.py b/pydir/crosstest.py
index d5f240a..4846815 100755
--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py
@@ -93,6 +93,8 @@
     argparser.add_argument('--filetype', default='obj', dest='filetype',
                            choices=['obj', 'asm', 'iasm'],
                            help='Output file type.  Default %(default)s.')
+    argparser.add_argument('--sz', dest='sz_args', action='append', default=[],
+                           help='Extra arguments to pass to pnacl-sz.')
     args = argparser.parse_args()
 
     nacl_root = FindBaseNaCl()
@@ -133,6 +135,7 @@
         obj_sz = os.path.join(args.dir, base_sz + '.sz.o')
         obj_llc = os.path.join(args.dir, base_sz + '.llc.o')
         shellcmd(['{path}/pnacl-sz'.format(path=os.path.dirname(mypath)),
+                  ] + args.sz_args + [
                   '-O' + args.optlevel,
                   '-mattr=' + args.attr,
                   '--target=' + args.target,
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index c34b776..ba80fd8 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -845,6 +845,8 @@
   void cmpxchg8b(const typename Traits::Address &address, bool Locked);
   void xadd(Type Ty, const typename Traits::Address &address,
             typename Traits::GPRRegister reg, bool Locked);
+  void xchg(Type Ty, typename Traits::GPRRegister reg0,
+            typename Traits::GPRRegister reg1);
   void xchg(Type Ty, const typename Traits::Address &address,
             typename Traits::GPRRegister reg);
 
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 2cb039a..ad5d13a 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -3150,6 +3150,29 @@
 }
 
 template <class Machine>
+void AssemblerX86Base<Machine>::xchg(Type Ty, typename Traits::GPRRegister reg0,
+                                     typename Traits::GPRRegister reg1) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride();
+  // Use the short form if either register is EAX; it has no byte-sized form.
+  if (reg0 == Traits::Encoded_Reg_Accumulator && Ty != IceType_i8) {
+    emitRexB(Ty, reg1);
+    emitUint8(0x90 + gprEncoding(reg1));
+  } else if (reg1 == Traits::Encoded_Reg_Accumulator && Ty != IceType_i8) {
+    emitRexB(Ty, reg0);
+    emitUint8(0x90 + gprEncoding(reg0));
+  } else {
+    emitRexRB(Ty, reg0, reg1);
+    if (isByteSizedArithType(Ty))
+      emitUint8(0x86);
+    else
+      emitUint8(0x87);
+    emitRegisterOperand(gprEncoding(reg0), gprEncoding(reg1));
+  }
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::xchg(Type Ty,
                                      const typename Traits::Address &addr,
                                      typename Traits::GPRRegister reg) {
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index ec7f3bb..fea785d 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -83,6 +83,10 @@
     cl::init(false));
 
 cl::opt<bool>
+    ForceMemIntrinOpt("fmem-intrin-opt",
+                      cl::desc("Force optimization of memory intrinsics."));
+
+cl::opt<bool>
     FunctionSections("ffunction-sections",
                      cl::desc("Emit functions into separate sections"));
 
@@ -356,6 +360,7 @@
   OutFlags.DisableTranslation = false;
   OutFlags.DumpStats = false;
   OutFlags.EnableBlockProfile = false;
+  OutFlags.ForceMemIntrinOpt = false;
   OutFlags.FunctionSections = false;
   OutFlags.GenerateUnitTestMessages = false;
   OutFlags.PhiEdgeSplit = false;
@@ -416,6 +421,7 @@
   OutFlags.setDisableTranslation(::DisableTranslation);
   OutFlags.setDumpStats(::DumpStats);
   OutFlags.setEnableBlockProfile(::EnableBlockProfile);
+  OutFlags.setForceMemIntrinOpt(::ForceMemIntrinOpt);
   OutFlags.setFunctionSections(::FunctionSections);
   OutFlags.setNumTranslationThreads(::NumThreads);
   OutFlags.setOptLevel(::OLevel);
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 2ce3b4e..f71d8f4 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -72,6 +72,9 @@
   bool getEnableBlockProfile() const { return EnableBlockProfile; }
   void setEnableBlockProfile(bool NewValue) { EnableBlockProfile = NewValue; }
 
+  bool getForceMemIntrinOpt() const { return ForceMemIntrinOpt; }
+  void setForceMemIntrinOpt(bool NewValue) { ForceMemIntrinOpt = NewValue; }
+
   bool getFunctionSections() const { return FunctionSections; }
   void setFunctionSections(bool NewValue) { FunctionSections = NewValue; }
 
@@ -241,6 +244,7 @@
   bool DisableTranslation;
   bool DumpStats;
   bool EnableBlockProfile;
+  bool ForceMemIntrinOpt;
   bool FunctionSections;
   bool GenerateUnitTestMessages;
   bool PhiEdgeSplit;
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 4d26210..336e268 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -3202,19 +3202,29 @@
   typename InstX86Base<Machine>::Traits::Assembler *Asm =
       Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
   Type Ty = this->getSrc(0)->getType();
-  const auto Mem =
+  const auto *VarReg1 = llvm::cast<Variable>(this->getSrc(1));
+  assert(VarReg1->hasReg());
+  const typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister Reg1 =
+      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+          VarReg1->getRegNum());
+
+  if (const auto *VarReg0 = llvm::dyn_cast<Variable>(this->getSrc(0))) {
+    assert(VarReg0->hasReg());
+    const typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister Reg0 =
+        InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+            VarReg0->getRegNum());
+    Asm->xchg(Ty, Reg0, Reg1);
+    return;
+  }
+
+  const auto *Mem =
       llvm::cast<typename InstX86Base<Machine>::Traits::X86OperandMem>(
           this->getSrc(0));
   assert(Mem->getSegmentRegister() ==
          InstX86Base<Machine>::Traits::X86OperandMem::DefaultSegment);
   const typename InstX86Base<Machine>::Traits::Address Addr =
       Mem->toAsmAddress(Asm);
-  const auto VarReg = llvm::cast<Variable>(this->getSrc(1));
-  assert(VarReg->hasReg());
-  const typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister Reg =
-      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
-          VarReg->getRegNum());
-  Asm->xchg(Ty, Addr, Reg);
+  Asm->xchg(Ty, Addr, Reg1);
 }
 
 template <class Machine>
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index d1ca570..3e93b76 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -440,6 +440,11 @@
   return Call;
 }
 
+bool TargetLowering::shouldOptimizeMemIntrins() {
+  return Ctx->getFlags().getOptLevel() >= Opt_1 ||
+         Ctx->getFlags().getForceMemIntrinOpt();
+}
+
 void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C) const {
   if (!BuildDefs::dump())
     return;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 2d03a76..a5e6064 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -351,6 +351,8 @@
     Context.getLastInserted()->setDestNonKillable();
   }
 
+  bool shouldOptimizeMemIntrins();
+
   Cfg *Func;
   GlobalContext *Ctx;
   bool HasComputedFrame = false;
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index e0acbd6..1d47d3a 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -441,6 +441,13 @@
   /// The number of different NOP instructions
   static const uint32_t X86_NUM_NOP_VARIANTS = 5;
 
+  /// \name Limits for unrolling memory intrinsics.
+  /// @{
+  static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMSET_UNROLL_LIMIT = 16;
+  /// @}
+
   /// Value is in bytes. Return Value adjusted to the next highest multiple
   /// of the stack alignment.
   static uint32_t applyStackAlignment(uint32_t Value) {
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 4a12004..7cb85e0 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -456,6 +456,13 @@
   /// The number of different NOP instructions
   static const uint32_t X86_NUM_NOP_VARIANTS = 5;
 
+  /// \name Limits for unrolling memory intrinsics.
+  /// @{
+  static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMSET_UNROLL_LIMIT = 16;
+  /// @}
+
   /// Value is in bytes. Return Value adjusted to the next highest multiple
   /// of the stack alignment.
   static uint32_t applyStackAlignment(uint32_t Value) {
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index da863f4..31bc35b 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -192,9 +192,17 @@
                       Operand *Val);
   void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
                        Operand *SecondVal);
-  /// Replace a call to memcpy with inline instructions.
+  /// Load from memory for a given type.
+  void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset);
+  /// Store to memory for a given type.
+  void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset);
+  /// Copy memory of a given type from Src to Dest using OffsetAmt on both.
+  void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt);
+  /// Replace some calls to memcpy with inline instructions.
   void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
-  /// Replace a call to memset with inline instructions.
+  /// Replace some calls to memmove with inline instructions.
+  void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count);
+  /// Replace some calls to memset with inline instructions.
   void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
 
   /// Lower an indirect jump adding sandboxing when needed.
@@ -251,6 +259,19 @@
   Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
   static Type stackSlotType();
 
+  static constexpr uint32_t NoSizeLimit = 0;
+  static const Type TypeForSize[];
+  /// Returns the largest type which is equal to or smaller than Size bytes.
+  /// The type is suitable for copying memory, i.e. a load and store will be
+  /// a single instruction (for example x86 will get f64 not i64).
+  static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit);
+  /// Returns the smallest type which is equal to or larger than Size bytes. If
+  /// one doesn't exist then the largest type smaller than Size bytes is
+  /// returned. The type is suitable for memory copies as described at
+  /// largestTypeInSize.
+  static Type firstTypeThatFitsSize(uint32_t Size,
+                                    uint32_t MaxSize = NoSizeLimit);
+
   Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
 
   /// \name Returns a vector in a register with the given constant entries.
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index e190b5d..5e46c98 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3153,11 +3153,7 @@
     return;
   }
   case Intrinsics::Memmove: {
-    InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
+    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::Memset: {
@@ -3600,22 +3596,55 @@
 }
 
 template <class Machine>
+void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base,
+                                       Constant *Offset) {
+  auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _movp(Dest, Mem);
+  else if (Ty == IceType_f64)
+    _movq(Dest, Mem);
+  else
+    _mov(Dest, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value,
+                                        Variable *Base, Constant *Offset) {
+  auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _storep(Value, Mem);
+  else if (Ty == IceType_f64)
+    _storeq(Value, Mem);
+  else
+    _store(Value, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src,
+                                        int32_t OffsetAmt) {
+  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+  // TODO(ascull): this or add nullptr test to _movp, _movq
+  Variable *Data = makeReg(Ty);
+
+  typedLoad(Ty, Data, Src, Offset);
+  typedStore(Ty, Data, Dest, Offset);
+}
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
                                          Operand *Count) {
   // There is a load and store for each chunk in the unroll
-  constexpr uint32_t UNROLL_LIMIT = 8;
   constexpr uint32_t BytesPerStorep = 16;
-  constexpr uint32_t BytesPerStoreq = 8;
-  constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
 
   // Check if the operands are constants
   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   const bool IsCountConst = CountConst != nullptr;
   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
 
-  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
     // Unlikely, but nothing to do if it does happen
     if (CountValue == 0)
       return;
@@ -3623,80 +3652,29 @@
     Variable *SrcBase = legalizeToReg(Src);
     Variable *DestBase = legalizeToReg(Dest);
 
-    auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
-      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-      // TODO(ascull): this or add nullptr test to _movp, _movq
-      Variable *Data = makeReg(Ty);
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
 
-      // TODO(ascull): is 64-bit better with vector or scalar movq?
-      auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
-      if (isVectorType(Ty))
-        _movp(Data, SrcMem);
-      else if (Ty == IceType_f64)
-        _movq(Data, SrcMem);
-      else
-        _mov(Data, SrcMem);
-
-      auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
-      if (isVectorType(Ty))
-        _storep(Data, DestMem);
-      else if (Ty == IceType_f64)
-        _storeq(Data, DestMem);
-      else
-        _store(Data, DestMem);
-    };
-
-    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
-      if (Size > BytesPerStoreq) {
-        lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
-      } else if (Size > BytesPerStorei32) {
-        lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
-      } else if (Size > BytesPerStorei16) {
-        lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
-      } else if (Size > BytesPerStorei8) {
-        lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
-      } else if (Size == BytesPerStorei8) {
-        lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
-      }
-    };
-
-    if (CountValue >= BytesPerStorep) {
-      // Use large vector operations
-      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-        N -= BytesPerStorep;
-        lowerCopy(IceType_v16i8, N);
-      }
-      lowerLeftOvers(CountValue & 0xF);
-      return;
+    uint32_t RemainingBytes = CountValue;
+    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    while (RemainingBytes >= TyWidth) {
+      copyMemory(Ty, DestBase, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      Offset -= TyWidth;
     }
 
-    // Too small to use large vector operations so use small ones instead
-    if (CountValue >= BytesPerStoreq) {
-      lowerCopy(IceType_f64, 0);
-      lowerLeftOvers(CountValue - BytesPerStoreq);
+    if (RemainingBytes == 0)
       return;
-    }
 
-    // Too small for vector operations so use scalar ones
-    if (CountValue >= BytesPerStorei32) {
-      lowerCopy(IceType_i32, 0);
-      lowerLeftOvers(CountValue - BytesPerStorei32);
-      return;
-    }
-
-    // 3 is the awkward size as it is too small for the vector or 32-bit
-    // operations and will not work with lowerLeftOvers as there is no valid
-    // overlap.
-    if (CountValue == 3) {
-      lowerCopy(IceType_i16, 0);
-      lowerCopy(IceType_i8, 2);
-      return;
-    }
-
-    // 1 or 2 can be done in a single scalar copy
-    lowerLeftOvers(CountValue);
+    // Lower the remaining bytes. Adjust to larger types in order to make use
+    // of overlaps in the copies.
+    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+    Offset = CountValue - typeWidthInBytes(LeftOverTy);
+    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
     return;
   }
 
@@ -3709,14 +3687,83 @@
 }
 
 template <class Machine>
+void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src,
+                                          Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t BytesPerStorep = 16;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    std::tuple<Type, Constant *, Variable *>
+        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
+    Constant *Offset;
+    Variable *Reg;
+
+    // Copy the data into registers first: the source and destination could
+    // overlap, so we must not clobber memory that has yet to be read. This
+    // also means overlapping moves are safe, as we take a snapshot of memory.
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    size_t N = 0;
+    while (RemainingBytes >= TyWidth) {
+      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
+      Offset = Ctx->getConstantInt32(OffsetAmt);
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      OffsetAmt -= TyWidth;
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    if (RemainingBytes != 0) {
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
+      Ty = firstTypeThatFitsSize(RemainingBytes);
+      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    // Copy the data out into the destination memory
+    for (size_t i = 0; i < N; ++i) {
+      std::tie(Ty, Offset, Reg) = Moves[i];
+      typedStore(Ty, Reg, DestBase, Offset);
+    }
+
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
                                          Operand *Count) {
-  constexpr uint32_t UNROLL_LIMIT = 16;
   constexpr uint32_t BytesPerStorep = 16;
   constexpr uint32_t BytesPerStoreq = 8;
   constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
   assert(Val->getType() == IceType_i8);
 
   // Check if the operands are constants
@@ -3734,11 +3781,11 @@
   // TODO(ascull): if the count is constant but val is not it would be possible
   // to inline by spreading the value across 4 bytes and accessing subregs e.g.
   // eax, ax and al.
-  if (IsCountConst && IsValConst) {
+  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
     Variable *Base = nullptr;
+    Variable *VecReg = nullptr;
     const uint32_t SpreadValue =
         (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-    Variable *VecReg = nullptr;
 
     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
                                                         uint32_t OffsetAmt) {
@@ -3750,7 +3797,7 @@
       if (isVectorType(Ty)) {
         assert(VecReg != nullptr);
         _storep(VecReg, Mem);
-      } else if (Ty == IceType_i64) {
+      } else if (Ty == IceType_f64) {
         assert(VecReg != nullptr);
         _storeq(VecReg, Mem);
       } else {
@@ -3758,63 +3805,45 @@
       }
     };
 
-    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
-      if (Size > BytesPerStoreq) {
-        lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
-      } else if (Size > BytesPerStorei32) {
-        lowerSet(IceType_i64, CountValue - BytesPerStoreq);
-      } else if (Size > BytesPerStorei16) {
-        lowerSet(IceType_i32, CountValue - BytesPerStorei32);
-      } else if (Size > BytesPerStorei8) {
-        lowerSet(IceType_i16, CountValue - BytesPerStorei16);
-      } else if (Size == BytesPerStorei8) {
-        lowerSet(IceType_i8, CountValue - BytesPerStorei8);
-      }
-    };
-
-    // When the value is zero it can be loaded into a vector register cheaply
-    // using the xor trick.
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
+    Type Ty;
     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
-        CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+        CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is zero it can be loaded into a vector register cheaply
+      // using the xor trick.
       Base = legalizeToReg(Dest);
       VecReg = makeVectorOfZeros(IceType_v16i8);
-
-      // Too small to use large vector operations so use small ones instead
-      if (CountValue < BytesPerStorep) {
-        lowerSet(IceType_i64, 0);
-        lowerLeftOvers(CountValue - BytesPerStoreq);
-        return;
-      }
-
-      // Use large vector operations
-      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-        N -= 16;
-        lowerSet(IceType_v16i8, N);
-      }
-      lowerLeftOvers(CountValue & 0xF);
-      return;
+      Ty = largestTypeInSize(CountValue);
+    } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is non-zero or the count is small we can't use vector
+      // instructions so are limited to 32-bit stores.
+      Base = legalizeToReg(Dest);
+      constexpr uint32_t MaxSize = 4;
+      Ty = largestTypeInSize(CountValue, MaxSize);
     }
 
-    // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
-    if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
-      Base = legalizeToReg(Dest);
-      // 3 is the awkward size as it is too small for the vector or 32-bit
-      // operations and will not work with lowerLeftOvers as there is no valid
-      // overlap.
-      if (CountValue == 3) {
-        lowerSet(IceType_i16, 0);
-        lowerSet(IceType_i8, 2);
-        return;
+    if (Base) {
+      uint32_t TyWidth = typeWidthInBytes(Ty);
+
+      uint32_t RemainingBytes = CountValue;
+      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+      while (RemainingBytes >= TyWidth) {
+        lowerSet(Ty, Offset);
+        RemainingBytes -= TyWidth;
+        Offset -= TyWidth;
       }
 
-      // TODO(ascull); 64-bit can do better with 64-bit mov
-      for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
-        N -= 4;
-        lowerSet(IceType_i32, N);
-      }
-      lowerLeftOvers(CountValue & 0x3);
+      if (RemainingBytes == 0)
+        return;
+
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+      Offset = CountValue - typeWidthInBytes(LeftOverTy);
+      lowerSet(LeftOverTy, Offset);
       return;
     }
   }
@@ -5053,6 +5082,34 @@
   return Reg;
 }
 
+template <class Machine>
+const Type TargetX86Base<Machine>::TypeForSize[] = {
+    IceType_i8, IceType_i16, IceType_i32,
+    (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8};
+template <class Machine>
+Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size,
+                                               uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
+template <class Machine>
+Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size,
+                                                   uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  if (!llvm::isPowerOf2_32(Size))
+    ++TyIndex;
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
 template <class Machine> void TargetX86Base<Machine>::postLower() {
   if (Ctx->getFlags().getOptLevel() == Opt_m1)
     return;
diff --git a/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll
index 8175eab..5e5d7b0 100644
--- a/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll
@@ -4,8 +4,11 @@
 ; RUN:   --target x8632 -i %s --args -O2 -sandbox \
 ; RUN:   | %if --need=target_X8632 --command FileCheck %s
 ; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
-; RUN:   --target x8632 -i %s --args -Om1 -sandbox \
+; RUN:   --target x8632 -i %s --args -Om1 --fmem-intrin-opt -sandbox \
 ; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 -sandbox \
+; RUN:   | %if --need=target_X8632 --command FileCheck --check-prefix OM1 %s
 
 ; RUN: %if --need=target_ARM32 --need=allow_dump \
 ; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
@@ -27,6 +30,8 @@
 }
 ; CHECK-LABEL: test_memcpy
 ; CHECK: call {{.*}} R_{{.*}} memcpy
+; OM1-LABEL: test_memcpy
+; OM1: call  {{.*}} memcpy
 ; ARM32-LABEL: test_memcpy
 ; ARM32: bl {{.*}} memcpy
 
@@ -40,6 +45,8 @@
 }
 ; CHECK-LABEL: test_memcpy_long_const_len
 ; CHECK: call {{.*}} R_{{.*}} memcpy
+; OM1-LABEL: test_memcpy_long_const_len
+; OM1: call {{.*}} memcpy
 ; ARM32-LABEL: test_memcpy_long_const_len
 ; ARM32: bl {{.*}} memcpy
 
@@ -55,6 +62,8 @@
 ; CHECK: mov [[REG:[^,]*]],WORD PTR [{{.*}}]
 ; CHECK-NEXT: mov WORD PTR [{{.*}}],[[REG]]
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memcpy_very_small_const_len
+; OM1: call {{.*}} memcpy
 ; ARM32-LABEL: test_memcpy_very_small_const_len
 ; ARM32: bl {{.*}} memcpy
 
@@ -72,6 +81,8 @@
 ; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x2]
 ; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x2],[[REG]]
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memcpy_const_len_3
+; OM1: call {{.*}} memcpy
 ; ARM32-LABEL: test_memcpy_const_len_3
 ; ARM32: bl {{.*}} memcpy
 
@@ -89,6 +100,8 @@
 ; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x8]
 ; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x8],[[REG]]
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memcpy_mid_const_len
+; OM1: call {{.*}} memcpy
 ; ARM32-LABEL: test_memcpy_mid_const_len
 ; ARM32: bl {{.*}} memcpy
 
@@ -106,10 +119,12 @@
 ; CHECK-NEXT: movq [[REG:xmm[0-9]+]],QWORD PTR [{{.*}}+0x7]
 ; CHECK-NEXT: movq QWORD PTR [{{.*}}+0x7],[[REG]]
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memcpy_mid_const_len_overlap
+; OM1: call {{.*}} memcpy
 ; ARM32-LABEL: test_memcpy_mid_const_len_overlap
 ; ARM32: bl {{.*}} memcpy
 
-define void @test_memcpy_large_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
+define void @test_memcpy_big_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
 entry:
   %dst = inttoptr i32 %iptr_dst to i8*
   %src = inttoptr i32 %iptr_src to i8*
@@ -117,13 +132,15 @@
                                        i32 30, i32 1, i1 false)
   ret void
 }
-; CHECK-LABEL: test_memcpy_large_const_len_overlap
+; CHECK-LABEL: test_memcpy_big_const_len_overlap
 ; CHECK: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}]
 ; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[REG]]
 ; CHECK-NEXT: movups [[REG:xmm[0-9]+]],XMMWORD PTR [{{.*}}+0xe]
 ; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0xe],[[REG]]
 ; CHECK-NOT: mov
-; ARM32-LABEL: test_memcpy_large_const_len_overlap
+; OM1-LABEL: test_memcpy_big_const_len_overlap
+; OM1: call {{.*}} memcpy
+; ARM32-LABEL: test_memcpy_big_const_len_overlap
 ; ARM32: bl {{.*}} memcpy
 
 define void @test_memcpy_large_const_len(i32 %iptr_dst, i32 %iptr_src) {
@@ -142,35 +159,153 @@
 ; CHECK-NEXT: mov [[REG:[^,]*]],BYTE PTR [{{.*}}+0x20]
 ; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x20],[[REG]]
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memcpy_large_const_len
+; OM1: call {{.*}} memcpy
 ; ARM32-LABEL: test_memcpy_large_const_len
 ; ARM32: bl {{.*}} memcpy
 
-; TODO(jvoung) -- if we want to be clever, we can do memset without a function
-; call similar to memcpy.
 define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
 entry:
   %dst = inttoptr i32 %iptr_dst to i8*
   %src = inttoptr i32 %iptr_src to i8*
   call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                        i32 %len, i32 1, i1 false)
+                                       i32 %len, i32 1, i1 false)
   ret void
 }
 ; CHECK-LABEL: test_memmove
 ; CHECK: call {{.*}} R_{{.*}} memmove
+; OM1-LABEL: test_memmove
+; OM1: call {{.*}} memmove
 ; ARM32-LABEL: test_memmove
 ; ARM32: bl {{.*}} memmove
 
-define void @test_memmove_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
+define void @test_memmove_long_const_len(i32 %iptr_dst, i32 %iptr_src) {
 entry:
   %dst = inttoptr i32 %iptr_dst to i8*
   %src = inttoptr i32 %iptr_src to i8*
   call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
-                                        i32 32, i32 1, i1 false)
+                                       i32 4876, i32 1, i1 false)
   ret void
 }
-; CHECK-LABEL: test_memmove_const_len_align
+; CHECK-LABEL: test_memmove_long_const_len
 ; CHECK: call {{.*}} R_{{.*}} memmove
-; ARM32-LABEL: test_memmove_const_len_align
+; OM1-LABEL: test_memmove_long_const_len
+; OM1: call {{.*}} memmove
+; ARM32-LABEL: test_memmove_long_const_len
+; ARM32: bl {{.*}} memmove
+
+define void @test_memmove_very_small_const_len(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 2, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove_very_small_const_len
+; CHECK: mov [[REG:[^,]*]],WORD PTR [{{.*}}]
+; CHECK-NEXT: mov WORD PTR [{{.*}}],[[REG]]
+; CHECK-NOT: mov
+; OM1-LABEL: test_memmove_very_small_const_len
+; OM1: call {{.*}} memmove
+; ARM32-LABEL: test_memmove_very_small_const_len
+; ARM32: bl {{.*}} memmove
+
+define void @test_memmove_const_len_3(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 3, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove_const_len_3
+; CHECK: mov [[REG0:[^,]*]],WORD PTR [{{.*}}]
+; CHECK-NEXT: mov [[REG1:[^,]*]],BYTE PTR [{{.*}}+0x2]
+; CHECK-NEXT: mov WORD PTR [{{.*}}],[[REG0]]
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x2],[[REG1]]
+; CHECK-NOT: mov
+; OM1-LABEL: test_memmove_const_len_3
+; OM1: call {{.*}} memmove
+; ARM32-LABEL: test_memmove_const_len_3
+; ARM32: bl {{.*}} memmove
+
+define void @test_memmove_mid_const_len(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 9, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove_mid_const_len
+; CHECK: movq [[REG0:xmm[0-9]+]],QWORD PTR [{{.*}}]
+; CHECK-NEXT: mov [[REG1:[^,]*]],BYTE PTR [{{.*}}+0x8]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[REG0]]
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x8],[[REG1]]
+; CHECK-NOT: mov
+; OM1-LABEL: test_memmove_mid_const_len
+; OM1: call {{.*}} memmove
+; ARM32-LABEL: test_memmove_mid_const_len
+; ARM32: bl {{.*}} memmove
+
+define void @test_memmove_mid_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 15, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove_mid_const_len_overlap
+; CHECK: movq [[REG0:xmm[0-9]+]],QWORD PTR [{{.*}}]
+; CHECK-NEXT: movq [[REG1:xmm[0-9]+]],QWORD PTR [{{.*}}+0x7]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[REG0]]
+; CHECK-NEXT: movq QWORD PTR [{{.*}}+0x7],[[REG1]]
+; CHECK-NOT: mov
+; OM1-LABEL: test_memmove_mid_const_len_overlap
+; OM1: call {{.*}} memmove
+; ARM32-LABEL: test_memmove_mid_const_len_overlap
+; ARM32: bl {{.*}} memmove
+
+define void @test_memmove_big_const_len_overlap(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 30, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove_big_const_len_overlap
+; CHECK: movups [[REG0:xmm[0-9]+]],XMMWORD PTR [{{.*}}]
+; CHECK-NEXT: movups [[REG1:xmm[0-9]+]],XMMWORD PTR [{{.*}}+0xe]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[REG0]]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0xe],[[REG1]]
+; CHECK-NOT: mov
+; OM1-LABEL: test_memmove_big_const_len_overlap
+; OM1: call {{.*}} memmove
+; ARM32-LABEL: test_memmove_big_const_len_overlap
+; ARM32: bl {{.*}} memmove
+
+define void @test_memmove_large_const_len(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 33, i32 1, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_memmove_large_const_len
+; CHECK: movups [[REG0:xmm[0-9]+]],XMMWORD PTR [{{.*}}+0x10]
+; CHECK-NEXT: movups [[REG1:xmm[0-9]+]],XMMWORD PTR [{{.*}}]
+; CHECK-NEXT: mov [[REG2:[^,]*]],BYTE PTR [{{.*}}+0x20]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0x10],[[REG0]]
+; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[REG1]]
+; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x20],[[REG2]]
+; CHECK-NOT: mov
+; OM1-LABEL: test_memmove_large_const_len
+; OM1: call {{.*}} memmove
+; ARM32-LABEL: test_memmove_large_const_len
 ; ARM32: bl {{.*}} memmove
 
 define void @test_memset(i32 %iptr_dst, i32 %wide_val, i32 %len) {
@@ -184,6 +319,9 @@
 ; CHECK-LABEL: test_memset
 ; CHECK: movzx
 ; CHECK: call {{.*}} R_{{.*}} memset
+; OM1-LABEL: test_memset
+; OM1: movzx
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -199,6 +337,9 @@
 ; CHECK-LABEL: test_memset_const_len_align
 ; CHECK: movzx
 ; CHECK: call {{.*}} R_{{.*}} memset
+; OM1-LABEL: test_memset_const_len_align
+; OM1: movzx
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_const_len_align
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -212,6 +353,8 @@
 }
 ; CHECK-LABEL: test_memset_long_const_len_zero_val_align
 ; CHECK: call {{.*}} R_{{.*}} memset
+; OM1-LABEL: test_memset_long_const_len_zero_val_align
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_long_const_len_zero_val_align
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -225,6 +368,8 @@
 ; CHECK-LABEL: test_memset_const_val
 ; CHECK-NOT: movzx
 ; CHECK: call {{.*}} R_{{.*}} memset
+; OM1-LABEL: test_memset_const_val
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_const_val
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -238,6 +383,8 @@
 ; CHECK-LABEL: test_memset_const_val_len_very_small
 ; CHECK: mov WORD PTR [{{.*}}],0xa0a
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memset_const_val_len_very_small
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_const_val_len_very_small
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -252,6 +399,8 @@
 ; CHECK: mov WORD PTR [{{.*}}],0x1010
 ; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x2],0x10
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memset_const_val_len_3
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_const_val_len_3
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -267,6 +416,8 @@
 ; CHECK: mov DWORD PTR [{{.*}}],0x20202020
 ; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x8],0x20
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memset_const_val_len_mid
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_const_val_len_mid
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -282,6 +433,8 @@
 ; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[ZERO]]
 ; CHECK-NEXT: mov DWORD PTR [{{.*}}+0x8],0x0
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memset_zero_const_len_small
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_zero_const_len_small
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
@@ -297,22 +450,26 @@
 ; CHECK-NEXT: movq QWORD PTR [{{.*}}],[[ZERO]]
 ; CHECK-NEXT: movq QWORD PTR [{{.*}}+0x7],[[ZERO]]
 ; CHECK-NOT: mov
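+; 15 bytes: two 8-byte stores of the zeroed xmm, overlapping by one
+; byte at offset 0x7.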
+; OM1-LABEL: test_memset_zero_const_len_small_overlap
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_zero_const_len_small_overlap
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
 
-define void @test_memset_zero_const_len_large_overlap(i32 %iptr_dst) {
+define void @test_memset_zero_const_len_big_overlap(i32 %iptr_dst) {
 entry:
   %dst = inttoptr i32 %iptr_dst to i8*
   call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 30, i32 1, i1 false)
   ret void
 }
-; CHECK-LABEL: test_memset_zero_const_len_large_overlap
+; CHECK-LABEL: test_memset_zero_const_len_big_overlap
 ; CHECK: pxor [[ZERO:xmm[0-9]+]],[[ZERO]]
 ; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[ZERO]]
 ; CHECK-NEXT: movups XMMWORD PTR [{{.*}}+0xe],[[ZERO]]
 ; CHECK-NOT: mov
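+; pxor of a register with itself is the usual zero idiom; the two
+; 16-byte stores overlap at offset 0xe to cover all 30 bytes.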
-; ARM32-LABEL: test_memset_zero_const_len_large_overlap
+; OM1-LABEL: test_memset_zero_const_len_big_overlap
+; OM1: call {{.*}} R_{{.*}} memset
+; ARM32-LABEL: test_memset_zero_const_len_big_overlap
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
 
@@ -328,6 +485,8 @@
 ; CHECK-NEXT: movups XMMWORD PTR [{{.*}}],[[ZERO]]
 ; CHECK-NEXT: mov BYTE PTR [{{.*}}+0x20],0x0
 ; CHECK-NOT: mov
+; OM1-LABEL: test_memset_zero_const_len_large
+; OM1: call {{.*}} R_{{.*}} memset
 ; ARM32-LABEL: test_memset_zero_const_len_large
 ; ARM32: uxtb
 ; ARM32: bl {{.*}} memset
diff --git a/unittest/AssemblerX8632/Locked.cpp b/unittest/AssemblerX8632/Locked.cpp
index 82c1e14..9be0f94 100644
--- a/unittest/AssemblerX8632/Locked.cpp
+++ b/unittest/AssemblerX8632/Locked.cpp
@@ -82,6 +82,57 @@
 #undef TestImpl
 #undef TestImplSize
 #undef TestImplAddrReg
+
+#define TestImplRegReg(Reg0, Value0, Reg1, Value1, Size)                       \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Reg0 "," #Value0 ", " #Reg1 ", " #Value1 ", " #Size ")";          \
+    const uint32_t V0 = (Value0)&Mask##Size;                                   \
+    const uint32_t V1 = (Value1)&Mask##Size;                                   \
+                                                                               \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Reg0,                   \
+           Immediate(Value0));                                                 \
+    __ mov(IceType_i##Size, GPRRegister::Encoded_Reg_##Reg1,                   \
+           Immediate(Value1));                                                 \
+    __ xchg(IceType_i##Size, GPRRegister::Encoded_Reg_##Reg0,                  \
+            GPRRegister::Encoded_Reg_##Reg1);                                  \
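+    /* Mask both registers to the operand size so the harness can */           \
+    /* compare them as 32-bit values after the exchange. */                    \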
+    __ And(IceType_i32, GPRRegister::Encoded_Reg_##Reg0,                       \
+           Immediate(Mask##Size));                                             \
+    __ And(IceType_i32, GPRRegister::Encoded_Reg_##Reg1,                       \
+           Immediate(Mask##Size));                                             \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Reg1()) << TestString;                                  \
+    ASSERT_EQ(V1, test.Reg0()) << TestString;                                  \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSize(Reg0, Reg1, Size)                                         \
+  do {                                                                         \
+    TestImplRegReg(Reg0, 0xa2b34567, Reg1, 0x0507ddee, Size);                  \
+  } while (0)
+
+#define TestImpl(Reg0, Reg1)                                                   \
+  do {                                                                         \
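+    /* Only encodings 0-3 (eax, ecx, edx, ebx) have 8-bit forms on */          \
+    /* x86-32, so skip the 8-bit variant for the other registers. */           \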
+    if (GPRRegister::Encoded_Reg_##Reg0 < 4 &&                                 \
+        GPRRegister::Encoded_Reg_##Reg1 < 4) {                                 \
+      TestImplSize(Reg0, Reg1, 8);                                             \
+    }                                                                          \
+    TestImplSize(Reg0, Reg1, 16);                                              \
+    TestImplSize(Reg0, Reg1, 32);                                              \
+  } while (0)
+
+  TestImpl(eax, ebx);
+  TestImpl(edx, eax);
+  TestImpl(ecx, edx);
+  TestImpl(esi, eax);
+  TestImpl(edx, edi);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplRegReg
 }
 
 TEST_F(AssemblerX8632Test, Xadd) {
diff --git a/unittest/AssemblerX8664/Locked.cpp b/unittest/AssemblerX8664/Locked.cpp
index f93f2d2..086bd04 100644
--- a/unittest/AssemblerX8664/Locked.cpp
+++ b/unittest/AssemblerX8664/Locked.cpp
@@ -85,6 +85,54 @@
 #undef TestImpl
 #undef TestImplSize
 #undef TestImplAddrReg
+
+#define TestImplRegReg(Reg0, Value0, Reg1, Value1, Size)                       \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Reg0 "," #Value0 ", " #Reg1 ", " #Value1 ", " #Size ")";          \
+    const uint32_t V0 = (Value0)&Mask##Size;                                   \
+    const uint32_t V1 = (Value1)&Mask##Size;                                   \
+                                                                               \
+    __ mov(IceType_i##Size, Encoded_GPR_##Reg0(), Immediate(Value0));          \
+    __ mov(IceType_i##Size, Encoded_GPR_##Reg1(), Immediate(Value1));          \
+    __ xchg(IceType_i##Size, Encoded_GPR_##Reg0(), Encoded_GPR_##Reg1());      \
+    __ And(IceType_i32, Encoded_GPR_##Reg0(), Immediate(Mask##Size));          \
+    __ And(IceType_i32, Encoded_GPR_##Reg1(), Immediate(Mask##Size));          \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Reg1()) << TestString;                                  \
+    ASSERT_EQ(V1, test.Reg0()) << TestString;                                  \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplSize(Reg0, Reg1, Size)                                         \
+  do {                                                                         \
+    TestImplRegReg(Reg0, 0xa2b34567, Reg1, 0x0507ddee, Size);                  \
+  } while (0)
+
+#define TestImpl(Reg0, Reg1)                                                   \
+  do {                                                                         \
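+    /* With a REX prefix every GPR has an 8-bit form on x86-64, so */          \
+    /* the 8-bit variant runs for every register pair. */                      \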
+    TestImplSize(Reg0, Reg1, 8);                                               \
+    TestImplSize(Reg0, Reg1, 16);                                              \
+    TestImplSize(Reg0, Reg1, 32);                                              \
+  } while (0)
+
+  // r1 == rax, so these exchanges use xchg's short 0x90+reg encoding.
+  TestImpl(r6, r1);
+  TestImpl(r1, r8);
+
+  TestImpl(r2, r10);
+  TestImpl(r3, r11);
+  TestImpl(r4, r12);
+  TestImpl(r5, r13);
+  TestImpl(r6, r14);
+  TestImpl(r7, r15);
+
+#undef TestImpl
+#undef TestImplSize
+#undef TestImplRegReg
 }
 
 TEST_F(AssemblerX8664Test, Xadd) {