Inline memmove for small constant sizes and refactor memcpy and memset.

The memory intrinsics are only optimized at -O1 and higher unless the
-fmem-intrin-opt flag is set to force the optimization to take place.
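
Concretely, each lowering now takes the inline path only when this gate holds.
A minimal sketch of the guard (not part of the upstream message; "Limit" stands
in for the per-intrinsic MEM*_UNROLL_LIMIT checks in the code below):

    // The real code uses TargetLowering::shouldOptimizeMemIntrins(), i.e.
    // OptLevel >= Opt_1 or the ForceMemIntrinOpt flag.
    if (shouldOptimizeMemIntrins() && IsCountConst && CountValue <= Limit) {
      // emit unrolled loads/stores for the constant-size case
    } else {
      // fall back on the helper call (memcpy/memmove/memset)
    }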

This change also introduces the xchg instruction for two register operands. It is
no longer used by the memory intrinsic lowering (or by anything else), but the
implementation is left in place for future use.
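
For illustration, the bytes the new register-register overload emits (derived
from the implementation in IceAssemblerX86BaseImpl.h below; not part of the
original commit message):

    xchg(IceType_i32, eax, ecx)  ->  91      ; short form 0x90 + reg, taken when
                                             ; either operand is the accumulator
    xchg(IceType_i32, ecx, edx)  ->  87 ca   ; 0x87 /r form (0x86 for byte-sized
                                             ; types, 0x66 prefix added for i16)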

BUG=
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1278173009.
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index c34b776..ba80fd8 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -845,6 +845,8 @@
   void cmpxchg8b(const typename Traits::Address &address, bool Locked);
   void xadd(Type Ty, const typename Traits::Address &address,
             typename Traits::GPRRegister reg, bool Locked);
+  void xchg(Type Ty, typename Traits::GPRRegister reg0,
+            typename Traits::GPRRegister reg1);
   void xchg(Type Ty, const typename Traits::Address &address,
             typename Traits::GPRRegister reg);
 
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 2cb039a..ad5d13a 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -3150,6 +3150,29 @@
 }
 
 template <class Machine>
+void AssemblerX86Base<Machine>::xchg(Type Ty, typename Traits::GPRRegister reg0,
+                                     typename Traits::GPRRegister reg1) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride();
+  // Use short form if either register is EAX.
+  if (reg0 == Traits::Encoded_Reg_Accumulator) {
+    emitRexB(Ty, reg1);
+    emitUint8(0x90 + gprEncoding(reg1));
+  } else if (reg1 == Traits::Encoded_Reg_Accumulator) {
+    emitRexB(Ty, reg0);
+    emitUint8(0x90 + gprEncoding(reg0));
+  } else {
+    emitRexRB(Ty, reg0, reg1);
+    if (isByteSizedArithType(Ty))
+      emitUint8(0x86);
+    else
+      emitUint8(0x87);
+    emitRegisterOperand(gprEncoding(reg0), gprEncoding(reg1));
+  }
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::xchg(Type Ty,
                                      const typename Traits::Address &addr,
                                      typename Traits::GPRRegister reg) {
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index ec7f3bb..fea785d 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -83,6 +83,10 @@
     cl::init(false));
 
 cl::opt<bool>
+    ForceMemIntrinOpt("fmem-intrin-opt",
+                      cl::desc("Force optimization of memory intrinsics."));
+
+cl::opt<bool>
     FunctionSections("ffunction-sections",
                      cl::desc("Emit functions into separate sections"));
 
@@ -356,6 +360,7 @@
   OutFlags.DisableTranslation = false;
   OutFlags.DumpStats = false;
   OutFlags.EnableBlockProfile = false;
+  OutFlags.ForceMemIntrinOpt = false;
   OutFlags.FunctionSections = false;
   OutFlags.GenerateUnitTestMessages = false;
   OutFlags.PhiEdgeSplit = false;
@@ -416,6 +421,7 @@
   OutFlags.setDisableTranslation(::DisableTranslation);
   OutFlags.setDumpStats(::DumpStats);
   OutFlags.setEnableBlockProfile(::EnableBlockProfile);
+  OutFlags.setForceMemIntrinOpt(::ForceMemIntrinOpt);
   OutFlags.setFunctionSections(::FunctionSections);
   OutFlags.setNumTranslationThreads(::NumThreads);
   OutFlags.setOptLevel(::OLevel);
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 2ce3b4e..f71d8f4 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -72,6 +72,9 @@
   bool getEnableBlockProfile() const { return EnableBlockProfile; }
   void setEnableBlockProfile(bool NewValue) { EnableBlockProfile = NewValue; }
 
+  bool getForceMemIntrinOpt() const { return ForceMemIntrinOpt; }
+  void setForceMemIntrinOpt(bool NewValue) { ForceMemIntrinOpt = NewValue; }
+
   bool getFunctionSections() const { return FunctionSections; }
   void setFunctionSections(bool NewValue) { FunctionSections = NewValue; }
 
@@ -241,6 +244,7 @@
   bool DisableTranslation;
   bool DumpStats;
   bool EnableBlockProfile;
+  bool ForceMemIntrinOpt;
   bool FunctionSections;
   bool GenerateUnitTestMessages;
   bool PhiEdgeSplit;
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 4d26210..336e268 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -3202,19 +3202,29 @@
   typename InstX86Base<Machine>::Traits::Assembler *Asm =
       Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
   Type Ty = this->getSrc(0)->getType();
-  const auto Mem =
+  const auto *VarReg1 = llvm::cast<Variable>(this->getSrc(1));
+  assert(VarReg1->hasReg());
+  const typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister Reg1 =
+      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+          VarReg1->getRegNum());
+
+  if (const auto *VarReg0 = llvm::dyn_cast<Variable>(this->getSrc(0))) {
+    assert(VarReg0->hasReg());
+    const typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister Reg0 =
+        InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+            VarReg0->getRegNum());
+    Asm->xchg(Ty, Reg0, Reg1);
+    return;
+  }
+
+  const auto *Mem =
       llvm::cast<typename InstX86Base<Machine>::Traits::X86OperandMem>(
           this->getSrc(0));
   assert(Mem->getSegmentRegister() ==
          InstX86Base<Machine>::Traits::X86OperandMem::DefaultSegment);
   const typename InstX86Base<Machine>::Traits::Address Addr =
       Mem->toAsmAddress(Asm);
-  const auto VarReg = llvm::cast<Variable>(this->getSrc(1));
-  assert(VarReg->hasReg());
-  const typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister Reg =
-      InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
-          VarReg->getRegNum());
-  Asm->xchg(Ty, Addr, Reg);
+  Asm->xchg(Ty, Addr, Reg1);
 }
 
 template <class Machine>
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index d1ca570..3e93b76 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -440,6 +440,11 @@
   return Call;
 }
 
+bool TargetLowering::shouldOptimizeMemIntrins() {
+  return Ctx->getFlags().getOptLevel() >= Opt_1 ||
+         Ctx->getFlags().getForceMemIntrinOpt();
+}
+
 void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C) const {
   if (!BuildDefs::dump())
     return;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 2d03a76..a5e6064 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -351,6 +351,8 @@
     Context.getLastInserted()->setDestNonKillable();
   }
 
+  bool shouldOptimizeMemIntrins();
+
   Cfg *Func;
   GlobalContext *Ctx;
   bool HasComputedFrame = false;
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index e0acbd6..1d47d3a 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -441,6 +441,13 @@
   /// The number of different NOP instructions
   static const uint32_t X86_NUM_NOP_VARIANTS = 5;
 
+  /// \name Limits for unrolling memory intrinsics.
+  /// @{
+  static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMSET_UNROLL_LIMIT = 16;
+  /// @}
+
   /// Value is in bytes. Return Value adjusted to the next highest multiple
   /// of the stack alignment.
   static uint32_t applyStackAlignment(uint32_t Value) {
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 4a12004..7cb85e0 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -456,6 +456,13 @@
   /// The number of different NOP instructions
   static const uint32_t X86_NUM_NOP_VARIANTS = 5;
 
+  /// \name Limits for unrolling memory intrinsics.
+  /// @{
+  static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
+  static constexpr uint32_t MEMSET_UNROLL_LIMIT = 16;
+  /// @}
+
   /// Value is in bytes. Return Value adjusted to the next highest multiple
   /// of the stack alignment.
   static uint32_t applyStackAlignment(uint32_t Value) {
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index da863f4..31bc35b 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -192,9 +192,17 @@
                       Operand *Val);
   void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
                        Operand *SecondVal);
-  /// Replace a call to memcpy with inline instructions.
+  /// Load from memory for a given type.
+  void typedLoad(Type Ty, Variable *Dest, Variable *Base, Constant *Offset);
+  /// Store to memory for a given type.
+  void typedStore(Type Ty, Variable *Value, Variable *Base, Constant *Offset);
+  /// Copy memory of given type from Src to Dest using OffsetAmt on both.
+  void copyMemory(Type Ty, Variable *Dest, Variable *Src, int32_t OffsetAmt);
+  /// Replace some calls to memcpy with inline instructions.
   void lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count);
-  /// Replace a call to memset with inline instructions.
+  /// Replace some calls to memmove with inline instructions.
+  void lowerMemmove(Operand *Dest, Operand *Src, Operand *Count);
+  /// Replace some calls to memset with inline instructions.
   void lowerMemset(Operand *Dest, Operand *Val, Operand *Count);
 
   /// Lower an indirect jump adding sandboxing when needed.
@@ -251,6 +259,19 @@
   Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
   static Type stackSlotType();
 
+  static constexpr uint32_t NoSizeLimit = 0;
+  static const Type TypeForSize[];
+  /// Returns the largest type which is equal to or smaller than Size bytes. The
+  /// type is suitable for copying memory i.e. a load and store will be a
+  /// single instruction (for example x86 will get f64 not i64).
+  static Type largestTypeInSize(uint32_t Size, uint32_t MaxSize = NoSizeLimit);
+  /// Returns the smallest type which is equal to or larger than Size bytes. If
+  /// one doesn't exist then the largest type smaller than Size bytes is
+  /// returned. The type is suitable for memory copies as described at
+  /// largestTypeInSize.
+  static Type firstTypeThatFitsSize(uint32_t Size,
+                                    uint32_t MaxSize = NoSizeLimit);
+
   Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
 
   /// \name Returns a vector in a register with the given constant entries.
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index e190b5d..5e46c98 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3153,11 +3153,7 @@
     return;
   }
   case Intrinsics::Memmove: {
-    InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
+    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::Memset: {
@@ -3600,22 +3596,55 @@
 }
 
 template <class Machine>
+void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base,
+                                       Constant *Offset) {
+  auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _movp(Dest, Mem);
+  else if (Ty == IceType_f64)
+    _movq(Dest, Mem);
+  else
+    _mov(Dest, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value,
+                                        Variable *Base, Constant *Offset) {
+  auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _storep(Value, Mem);
+  else if (Ty == IceType_f64)
+    _storeq(Value, Mem);
+  else
+    _store(Value, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src,
+                                        int32_t OffsetAmt) {
+  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+  // TODO(ascull): this or add nullptr test to _movp, _movq
+  Variable *Data = makeReg(Ty);
+
+  typedLoad(Ty, Data, Src, Offset);
+  typedStore(Ty, Data, Dest, Offset);
+}
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
                                          Operand *Count) {
   // There is a load and store for each chunk in the unroll
-  constexpr uint32_t UNROLL_LIMIT = 8;
   constexpr uint32_t BytesPerStorep = 16;
-  constexpr uint32_t BytesPerStoreq = 8;
-  constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
 
   // Check if the operands are constants
   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   const bool IsCountConst = CountConst != nullptr;
   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
 
-  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
     // Unlikely, but nothing to do if it does happen
     if (CountValue == 0)
       return;
@@ -3623,80 +3652,29 @@
     Variable *SrcBase = legalizeToReg(Src);
     Variable *DestBase = legalizeToReg(Dest);
 
-    auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
-      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-      // TODO(ascull): this or add nullptr test to _movp, _movq
-      Variable *Data = makeReg(Ty);
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
 
-      // TODO(ascull): is 64-bit better with vector or scalar movq?
-      auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
-      if (isVectorType(Ty))
-        _movp(Data, SrcMem);
-      else if (Ty == IceType_f64)
-        _movq(Data, SrcMem);
-      else
-        _mov(Data, SrcMem);
-
-      auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
-      if (isVectorType(Ty))
-        _storep(Data, DestMem);
-      else if (Ty == IceType_f64)
-        _storeq(Data, DestMem);
-      else
-        _store(Data, DestMem);
-    };
-
-    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
-      if (Size > BytesPerStoreq) {
-        lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
-      } else if (Size > BytesPerStorei32) {
-        lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
-      } else if (Size > BytesPerStorei16) {
-        lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
-      } else if (Size > BytesPerStorei8) {
-        lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
-      } else if (Size == BytesPerStorei8) {
-        lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
-      }
-    };
-
-    if (CountValue >= BytesPerStorep) {
-      // Use large vector operations
-      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-        N -= BytesPerStorep;
-        lowerCopy(IceType_v16i8, N);
-      }
-      lowerLeftOvers(CountValue & 0xF);
-      return;
+    uint32_t RemainingBytes = CountValue;
+    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    while (RemainingBytes >= TyWidth) {
+      copyMemory(Ty, DestBase, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      Offset -= TyWidth;
     }
 
-    // Too small to use large vector operations so use small ones instead
-    if (CountValue >= BytesPerStoreq) {
-      lowerCopy(IceType_f64, 0);
-      lowerLeftOvers(CountValue - BytesPerStoreq);
+    if (RemainingBytes == 0)
       return;
-    }
 
-    // Too small for vector operations so use scalar ones
-    if (CountValue >= BytesPerStorei32) {
-      lowerCopy(IceType_i32, 0);
-      lowerLeftOvers(CountValue - BytesPerStorei32);
-      return;
-    }
-
-    // 3 is the awkward size as it is too small for the vector or 32-bit
-    // operations and will not work with lowerLeftOvers as there is no valid
-    // overlap.
-    if (CountValue == 3) {
-      lowerCopy(IceType_i16, 0);
-      lowerCopy(IceType_i8, 2);
-      return;
-    }
-
-    // 1 or 2 can be done in a single scalar copy
-    lowerLeftOvers(CountValue);
+    // Lower the remaining bytes. Adjust to larger types in order to make use
+    // of overlaps in the copies.
+    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+    Offset = CountValue - typeWidthInBytes(LeftOverTy);
+    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
     return;
   }
 
@@ -3709,14 +3687,83 @@
 }
 
 template <class Machine>
+void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src,
+                                          Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t BytesPerStorep = 16;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    std::tuple<Type, Constant *, Variable *>
+        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
+    Constant *Offset;
+    Variable *Reg;
+
+    // Copy the data into registers as the source and destination could overlap
+    // so make sure not to clobber the memory. This also means overlapping moves
+    // can be used as we are taking a safe snapshot of the memory.
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    size_t N = 0;
+    while (RemainingBytes >= TyWidth) {
+      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
+      Offset = Ctx->getConstantInt32(OffsetAmt);
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      OffsetAmt -= TyWidth;
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    if (RemainingBytes != 0) {
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
+      Ty = firstTypeThatFitsSize(RemainingBytes);
+      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    // Copy the data out into the destination memory
+    for (size_t i = 0; i < N; ++i) {
+      std::tie(Ty, Offset, Reg) = Moves[i];
+      typedStore(Ty, Reg, DestBase, Offset);
+    }
+
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
                                          Operand *Count) {
-  constexpr uint32_t UNROLL_LIMIT = 16;
   constexpr uint32_t BytesPerStorep = 16;
   constexpr uint32_t BytesPerStoreq = 8;
   constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
   assert(Val->getType() == IceType_i8);
 
   // Check if the operands are constants
@@ -3734,11 +3781,11 @@
   // TODO(ascull): if the count is constant but val is not it would be possible
   // to inline by spreading the value across 4 bytes and accessing subregs e.g.
   // eax, ax and al.
-  if (IsCountConst && IsValConst) {
+  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
     Variable *Base = nullptr;
+    Variable *VecReg = nullptr;
     const uint32_t SpreadValue =
         (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-    Variable *VecReg = nullptr;
 
     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
                                                         uint32_t OffsetAmt) {
@@ -3750,7 +3797,7 @@
       if (isVectorType(Ty)) {
         assert(VecReg != nullptr);
         _storep(VecReg, Mem);
-      } else if (Ty == IceType_i64) {
+      } else if (Ty == IceType_f64) {
         assert(VecReg != nullptr);
         _storeq(VecReg, Mem);
       } else {
@@ -3758,63 +3805,45 @@
       }
     };
 
-    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
-      if (Size > BytesPerStoreq) {
-        lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
-      } else if (Size > BytesPerStorei32) {
-        lowerSet(IceType_i64, CountValue - BytesPerStoreq);
-      } else if (Size > BytesPerStorei16) {
-        lowerSet(IceType_i32, CountValue - BytesPerStorei32);
-      } else if (Size > BytesPerStorei8) {
-        lowerSet(IceType_i16, CountValue - BytesPerStorei16);
-      } else if (Size == BytesPerStorei8) {
-        lowerSet(IceType_i8, CountValue - BytesPerStorei8);
-      }
-    };
-
-    // When the value is zero it can be loaded into a vector register cheaply
-    // using the xor trick.
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
+    Type Ty;
     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
-        CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+        CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is zero it can be loaded into a vector register cheaply
+      // using the xor trick.
       Base = legalizeToReg(Dest);
       VecReg = makeVectorOfZeros(IceType_v16i8);
-
-      // Too small to use large vector operations so use small ones instead
-      if (CountValue < BytesPerStorep) {
-        lowerSet(IceType_i64, 0);
-        lowerLeftOvers(CountValue - BytesPerStoreq);
-        return;
-      }
-
-      // Use large vector operations
-      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-        N -= 16;
-        lowerSet(IceType_v16i8, N);
-      }
-      lowerLeftOvers(CountValue & 0xF);
-      return;
+      Ty = largestTypeInSize(CountValue);
+    } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
+      // When the value is non-zero or the count is small we can't use vector
+      // instructions so are limited to 32-bit stores.
+      Base = legalizeToReg(Dest);
+      constexpr uint32_t MaxSize = 4;
+      Ty = largestTypeInSize(CountValue, MaxSize);
     }
 
-    // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
-    if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
-      Base = legalizeToReg(Dest);
-      // 3 is the awkward size as it is too small for the vector or 32-bit
-      // operations and will not work with lowerLeftOvers as there is no valid
-      // overlap.
-      if (CountValue == 3) {
-        lowerSet(IceType_i16, 0);
-        lowerSet(IceType_i8, 2);
-        return;
+    if (Base) {
+      uint32_t TyWidth = typeWidthInBytes(Ty);
+
+      uint32_t RemainingBytes = CountValue;
+      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+      while (RemainingBytes >= TyWidth) {
+        lowerSet(Ty, Offset);
+        RemainingBytes -= TyWidth;
+        Offset -= TyWidth;
       }
 
-      // TODO(ascull); 64-bit can do better with 64-bit mov
-      for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
-        N -= 4;
-        lowerSet(IceType_i32, N);
-      }
-      lowerLeftOvers(CountValue & 0x3);
+      if (RemainingBytes == 0)
+        return;
+
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+      Offset = CountValue - typeWidthInBytes(LeftOverTy);
+      lowerSet(LeftOverTy, Offset);
       return;
     }
   }
@@ -5053,6 +5082,34 @@
   return Reg;
 }
 
+template <class Machine>
+const Type TargetX86Base<Machine>::TypeForSize[] = {
+    IceType_i8, IceType_i16, IceType_i32,
+    (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8};
+template <class Machine>
+Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size,
+                                               uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
+template <class Machine>
+Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size,
+                                                   uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  if (!llvm::isPowerOf2_32(Size))
+    ++TyIndex;
+  uint32_t MaxIndex = MaxSize == NoSizeLimit
+                          ? llvm::array_lengthof(TypeForSize) - 1
+                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
 template <class Machine> void TargetX86Base<Machine>::postLower() {
   if (Ctx->getFlags().getOptLevel() == Opt_m1)
     return;
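
As a worked illustration of the copy strategy above (largest type, used in
reverse order, then one overlapping copy for the tail), a constant Count of 13
lowers roughly as follows. The offsets and types follow from the patch's own
helpers; the actual instructions depend on register allocation, so this is a
sketch only:

    // Sketch: lowerMemcpy(Dest, Src, /*Count=*/13)
    //   Ty        = largestTypeInSize(13)   -> IceType_f64 (8 bytes; i64 on x86-64)
    //   main loop: copyMemory(Ty, DestBase, SrcBase, /*OffsetAmt=*/0);  // bytes 0..7
    //   remainder: firstTypeThatFitsSize(5) -> 8-byte type again
    //              copyMemory(Ty, DestBase, SrcBase, /*OffsetAmt=*/5);  // bytes 5..12,
    //                                                                   // overlaps 5..7
    //
    // lowerMemmove walks the same offsets but issues all typedLoad calls before
    // any typedStore, so the snapshot held in registers is safe even when Src
    // and Dest overlap.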