diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 9eb934b..498c5e9 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -457,19 +457,17 @@
     return;
   // Sort by decreasing alignment.  This does not really matter at the moment,
   // but will allow compacting stack allocation when we fuse to one alloca.
-  std::sort(Allocas.begin(), Allocas.end(),
-            [](Inst *I1, Inst *I2) {
-              auto *A1 = llvm::dyn_cast<InstAlloca>(I1);
-              auto *A2 = llvm::dyn_cast<InstAlloca>(I2);
-              return A1->getAlignInBytes() > A2->getAlignInBytes();
-            });
-  for (Inst *Instr: Allocas) {
+  std::sort(Allocas.begin(), Allocas.end(), [](Inst *I1, Inst *I2) {
+    // Use cast<>, as the loop below does, so a non-alloca trips an assert.
+    return llvm::cast<InstAlloca>(I1)->getAlignInBytes() >
+           llvm::cast<InstAlloca>(I2)->getAlignInBytes();
+  });
+  for (Inst *Instr : Allocas) {
     auto *Alloca = llvm::cast<InstAlloca>(Instr);
     // Move the alloca to its sorted position.
-    InstAlloca *NewAlloca = InstAlloca::create(this,
-                                               Alloca->getSizeInBytes(),
-                                               Alloca->getAlignInBytes(),
-                                               Alloca->getDest());
+    InstAlloca *NewAlloca =
+        InstAlloca::create(this, Alloca->getSizeInBytes(),
+                           Alloca->getAlignInBytes(), Alloca->getDest());
     if (IsKnownFrameOffset)
       NewAlloca->setKnownFrameOffset();
     Insts.push_front(NewAlloca);
@@ -506,8 +504,7 @@
         // Allocations aligned more than the stack require a frame pointer.
         RequiresFramePointer = true;
         AlignedAllocas.push_back(Alloca);
-      }
-      else
+      } else
         FixedAllocas.push_back(Alloca);
     }
   }
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index e0628c4..004a0b3 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -304,6 +304,7 @@
   PhiDesc() = delete;
   PhiDesc(const PhiDesc &) = delete;
   PhiDesc &operator=(const PhiDesc &) = delete;
+
 public:
   PhiDesc(InstPhi *Phi, Variable *Dest) : Phi(Phi), Dest(Dest) {}
   PhiDesc(PhiDesc &&) = default;
@@ -457,7 +458,7 @@
         if (Item2.Processed)
           continue;
         // There shouldn't be two different Phis with the same Dest variable or
-          // register.
+        // register.
         assert((&Item == &Item2) || !sameVarOrReg(Target, Dest, Item2.Dest));
         if (sameVarOrReg(Target, Dest, Item2.Src))
           ++Item.NumPred;
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index dd70082..b71b0cb 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -55,13 +55,13 @@
   X(Reg_bl, 3, "bl", Reg_ebx, 0,1,0,0, 1,0,0,0,1, 0, 0,0,0,1,1,                \
     REGLIST2(RegX8632, ebx, bx))                                               \
   /* High 8-bit registers */                                                   \
-  X(Reg_ah, 4, "ah", Reg_eax, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_ah, 4, "ah", Reg_eax, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, eax, ax))                                               \
-  X(Reg_ch, 5, "ch", Reg_ecx, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_ch, 5, "ch", Reg_ecx, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, ecx, cx))                                               \
-  X(Reg_dh, 6, "dh", Reg_edx, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_dh, 6, "dh", Reg_edx, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, edx, dx))                                               \
-  X(Reg_bh, 7, "bh", Reg_ebx, 0,1,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_bh, 7, "bh", Reg_ebx, 0,1,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, ebx, bx))                                               \
   /* End of 8-bit register set */
 //#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,
@@ -212,22 +212,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8632_TABLE                                                     \
-  /* tag, element type, cvt , sdss, pack, width, fld */                        \
-  X(IceType_void,  IceType_void, "?",  "",   "",  "",  "")                     \
-  X(IceType_i1,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i8,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i16,   IceType_void, "si", "",   "",  "w", "")                     \
-  X(IceType_i32,   IceType_void, "si", "",   "",  "l", "")                     \
-  X(IceType_i64,   IceType_void, "si", "",   "",  "q", "")                     \
-  X(IceType_f32,   IceType_void, "ss", "ss", "d", "",  "s")                    \
-  X(IceType_f64,   IceType_void, "sd", "sd", "q", "",  "l")                    \
-  X(IceType_v4i1,  IceType_i32,  "?",  "",   "d", "",  "")                     \
-  X(IceType_v8i1,  IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v16i1, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v16i8, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v8i16, IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v4i32, IceType_i32,  "dq", "",   "d", "",  "")                     \
-  X(IceType_v4f32, IceType_f32,  "ps", "",   "d", "",  "")
+  /* tag,  element type, cvt , sdss, pack, width, fld */                       \
+  X(void,  void,         "?",  "",   "",   "",    "")                          \
+  X(i1,    void,         "si", "",   "",   "b",   "")                          \
+  X(i8,    void,         "si", "",   "",   "b",   "")                          \
+  X(i16,   void,         "si", "",   "",   "w",   "")                          \
+  X(i32,   void,         "si", "",   "",   "l",   "")                          \
+  X(i64,   void,         "si", "",   "",   "q",   "")                          \
+  X(f32,   void,         "ss", "ss", "d",  "",    "s")                         \
+  X(f64,   void,         "sd", "sd", "q",  "",    "l")                         \
+  X(v4i1,  i32,          "?",  "",   "d",  "",    "")                          \
+  X(v8i1,  i16,          "?",  "",   "w",  "",    "")                          \
+  X(v16i1, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v16i8, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v8i16, i16,          "?",  "",   "w",  "",    "")                          \
+  X(v4i32, i32,          "dq", "",   "d",  "",    "")                          \
+  X(v4f32, f32,          "ps", "",   "d",  "",    "")
 //#define X(tag, elementty, cvt, sdss, pack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index b3f452f..b3cdb40 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -292,22 +292,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8664_TABLE                                                     \
-  /* tag         , element type, cvt , sdss, pack, width, fld */               \
-  X(IceType_void,  IceType_void, "?",  "",   "",  "",  "")                     \
-  X(IceType_i1,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i8,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i16,   IceType_void, "si", "",   "",  "w", "")                     \
-  X(IceType_i32,   IceType_void, "si", "",   "",  "l", "")                     \
-  X(IceType_i64,   IceType_void, "si", "",   "",  "q", "")                     \
-  X(IceType_f32,   IceType_void, "ss", "ss", "d", "",  "s")                    \
-  X(IceType_f64,   IceType_void, "sd", "sd", "q", "",  "l")                    \
-  X(IceType_v4i1,  IceType_i32,  "?",  "",   "d", "",  "")                     \
-  X(IceType_v8i1,  IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v16i1, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v16i8, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v8i16, IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v4i32, IceType_i32,  "dq", "",   "d", "",  "")                     \
-  X(IceType_v4f32, IceType_f32,  "ps", "",   "d", "",  "")
+  /* tag,  element type, cvt , sdss, pack, width, fld */                       \
+  X(void,  void,         "?",  "",   "",   "",    "")                          \
+  X(i1,    void,         "si", "",   "",   "b",   "")                          \
+  X(i8,    void,         "si", "",   "",   "b",   "")                          \
+  X(i16,   void,         "si", "",   "",   "w",   "")                          \
+  X(i32,   void,         "si", "",   "",   "l",   "")                          \
+  X(i64,   void,         "si", "",   "",   "q",   "")                          \
+  X(f32,   void,         "ss", "ss", "d",  "",    "s")                         \
+  X(f64,   void,         "sd", "sd", "q",  "",    "l")                         \
+  X(v4i1,  i32,          "?",  "",   "d",  "",    "")                          \
+  X(v8i1,  i16,          "?",  "",   "w",  "",    "")                          \
+  X(v16i1, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v16i8, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v8i16, i16,          "?",  "",   "w",  "",    "")                          \
+  X(v4i32, i32,          "dq", "",   "d",  "",    "")                          \
+  X(v4f32, f32,          "ps", "",   "d",  "",    "")
 //#define X(tag, elementty, cvt, sdss, pack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8664_DEF
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 6428f53..e109bf2 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -1384,38 +1384,35 @@
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(this->getSrcSize() == 1);
   Operand *Src0 = this->getSrc(0);
-  assert(llvm::isa<Variable>(Src0));
+  int32_t DestReg = this->getDest()->getRegNum();
+  int32_t SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
   switch (Src0->getType()) {
   default:
     llvm_unreachable("unexpected source type!");
     break;
   case IceType_i8:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax ||
+           DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ah);
     Str << "\t"
         << "cbtw";
     break;
   case IceType_i16:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
     Str << "\t"
         << "cwtd";
     break;
   case IceType_i32:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Str << "\t"
         << "cltd";
     break;
   case IceType_i64:
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Str << "\t"
         << "cdto";
     break;
@@ -1428,35 +1425,32 @@
       Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
   assert(this->getSrcSize() == 1);
   Operand *Src0 = this->getSrc(0);
-  assert(llvm::isa<Variable>(Src0));
+  int32_t DestReg = this->getDest()->getRegNum();
+  int32_t SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
   switch (Src0->getType()) {
   default:
     llvm_unreachable("unexpected source type!");
     break;
   case IceType_i8:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax ||
+           DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ah);
     Asm->cbw();
     break;
   case IceType_i16:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
     Asm->cwd();
     break;
   case IceType_i32:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Asm->cdq();
     break;
   case IceType_i64:
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Asm->cqo();
     break;
   }
@@ -2278,32 +2272,29 @@
   } else {
     Str << "\tmov"
         << (!isScalarFloatingType(DestTy)
-                ? this->getWidthString(SrcTy)
+                ? this->getWidthString(DestTy)
                 : InstX86Base<Machine>::Traits::TypeAttributes[DestTy]
                       .SdSsString) << "\t";
   }
-  // For an integer truncation operation, src is wider than dest. Ideally, we
-  // use a mov instruction whose data width matches the narrower dest. This is
-  // a problem if e.g. src is a register like esi or si where there is no 8-bit
-  // version of the register. To be safe, we instead widen the dest to match
-  // src. This works even for stack-allocated dest variables because
-  // typeWidthOnStack() pads to a 4-byte boundary even if only a lower portion
-  // is used.
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
   // TODO: This assert disallows usages such as copying a floating
   // point value between a vector and a scalar (which movss is used for). Clean
   // this up.
   assert(Func->getTarget()->typeWidthInBytesOnStack(DestTy) ==
          Func->getTarget()->typeWidthInBytesOnStack(SrcTy));
-  Src->emit(Func);
+  const Operand *NewSrc = Src;
+  if (auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    int32_t NewRegNum = Variable::NoRegister;
+    if (SrcVar->hasReg())
+      NewRegNum = InstX86Base<Machine>::Traits::getGprForType(
+          DestTy, SrcVar->getRegNum());
+    if (SrcTy != DestTy)
+      NewSrc = SrcVar->asType(DestTy, NewRegNum);
+  }
+  NewSrc->emit(Func);
   Str << ", ";
-  int32_t NewRegNum = Variable::NoRegister;
-  if (this->getDest()->hasReg())
-    NewRegNum = InstX86Base<Machine>::Traits::getGprForType(
-        SrcTy, this->getDest()->getRegNum());
-  const Variable *NewDest = SrcTy == DestTy
-                                ? this->getDest()
-                                : this->getDest()->asType(SrcTy, NewRegNum);
-  NewDest->emit(Func);
+  this->getDest()->emit(Func);
 }
 
 template <class Machine>
@@ -2330,13 +2321,8 @@
       Machine>::Traits::Assembler::GPREmitterAddrOp GPRAddrEmitter = {
       &InstX86Base<Machine>::Traits::Assembler::mov,
       &InstX86Base<Machine>::Traits::Assembler::mov};
-  // For an integer truncation operation, src is wider than dest. Ideally, we
-  // use a mov instruction whose data width matches the narrower dest. This is
-  // a problem if e.g. src is a register like esi or si where there is no 8-bit
-  // version of the register. To be safe, we instead widen the dest to match
-  // src. This works even for stack-allocated dest variables because
-  // typeWidthOnStack() pads to a 4-byte boundary even if only a lower portion
-  // is used.
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
   // TODO: This assert disallows usages such as copying a floating
   // point value between a vector and a scalar (which movss is used for). Clean
   // this up.
@@ -2366,7 +2352,7 @@
         return;
       }
       if (isScalarIntegerType(SrcTy)) {
-        DestTy = SrcTy;
+        SrcTy = DestTy;
       }
       emitIASRegOpTyGPR<Machine>(Func, DestTy, Dest, Src, GPRRegEmitter);
       return;
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 0addc88..8ad78a5 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -428,6 +428,23 @@
 
 Ostream &operator<<(Ostream &Str, const LiveRange &L);
 
+/// RegClass indicates the physical register class that a Variable may be
+/// register-allocated from.  By default, a variable's register class is
+/// directly associated with its type (RC_<tag> == IceType_<tag>).  A target
+/// lowering may define additional register classes by extending the set of
+/// enum values, starting at RC_Target.
+enum RegClass : uint8_t {
+// Define RC_void, RC_i1, RC_i8, etc., one per ICETYPE_TABLE entry.
+#define X(tag, sizeLog2, align, elts, elty, str) RC_##tag = IceType_##tag,
+  ICETYPE_TABLE
+#undef X
+      RC_Target,
+  // Leave plenty of space for target-specific values (RC_Target..RC_Max).
+  RC_Max = std::numeric_limits<uint8_t>::max()
+};
+static_assert(RC_Target == static_cast<RegClass>(IceType_NUM),
+              "Expected RC_Target and IceType_NUM to be the same");
+
 /// Variable represents an operand that is register-allocated or
 /// stack-allocated. If it is register-allocated, it will ultimately have a
 /// non-negative RegNum field.
@@ -493,6 +510,9 @@
     return RegRequirement == RR_MustNotHaveRegister;
   }
 
+  void setRegClass(uint8_t RC) { RegisterClass = static_cast<RegClass>(RC); }
+  RegClass getRegClass() const { return RegisterClass; }
+
   LiveRange &getLiveRange() { return Live; }
   const LiveRange &getLiveRange() const { return Live; }
   void setLiveRange(const LiveRange &Range) { Live = Range; }
@@ -537,7 +557,8 @@
 
 protected:
   Variable(OperandKind K, Type Ty, SizeT Index)
-      : Operand(K, Ty), Number(Index) {
+      : Operand(K, Ty), Number(Index),
+        RegisterClass(static_cast<RegClass>(Ty)) {
     Vars = VarsReal;
     Vars[0] = this;
     NumVars = 1;
@@ -553,6 +574,7 @@
   /// pointer and other physical registers specifically referenced by name.
   bool IgnoreLiveness = false;
   RegRequirement RegRequirement = RR_MayHaveRegister;
+  RegClass RegisterClass;
   /// RegNum is the allocated register, or NoRegister if it isn't
   /// register-allocated.
   int32_t RegNum = NoRegister;
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 6b8656b..e6f3f56 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -833,8 +833,7 @@
     Iter.Cur = Unhandled.back();
     Unhandled.pop_back();
     dumpLiveRangeTrace("\nConsidering  ", Iter.Cur);
-    Iter.RegMask =
-        RegMaskFull & Target->getRegisterSetForType(Iter.Cur->getType());
+    Iter.RegMask = RegMaskFull & Target->getRegistersForVariable(Iter.Cur);
     KillsRange.trim(Iter.Cur->getLiveRange().getStart());
 
     // Check for pre-colored ranges. If Cur is pre-colored, it definitely gets
@@ -862,11 +861,10 @@
     // Disable AllowOverlap if an Active variable, which is not Prefer, shares
     // Prefer's register, and has a definition within Cur's live range.
     if (Iter.AllowOverlap) {
+      const llvm::SmallBitVector &Aliases = *RegAliases[Iter.PreferReg];
       for (const Variable *Item : Active) {
         int32_t RegNum = Item->getRegNumTmp();
-        // TODO(stichnot): Consider aliases of RegNum.  This is probably a
-        // correctness issue.
-        if (Item != Iter.Prefer && RegNum == Iter.PreferReg &&
+        if (Item != Iter.Prefer && Aliases[RegNum] &&
             overlapsDefs(Func, Iter.Cur, Item)) {
           Iter.AllowOverlap = false;
           dumpDisableOverlap(Func, Item, "Active");
diff --git a/src/IceRegistersARM32.h b/src/IceRegistersARM32.h
index a32ee4a..64dcf0d 100644
--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -17,6 +17,7 @@
 
 #include "IceDefs.h"
 #include "IceInstARM32.def"
+#include "IceOperand.h" // RC_Target
 #include "IceTypes.h"
 
 namespace Ice {
@@ -118,6 +119,9 @@
   static const char *RegNames[];
 };
 
+// ARM32 extensions of enum RegClass. None so far: RCARM32_NUM == RC_Target.
+enum RegClassARM32 : uint8_t { RCARM32_NUM = RC_Target };
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEREGISTERSARM32_H
diff --git a/src/IceRegistersMIPS32.h b/src/IceRegistersMIPS32.h
index df13f0e..38335f8 100644
--- a/src/IceRegistersMIPS32.h
+++ b/src/IceRegistersMIPS32.h
@@ -17,6 +17,7 @@
 
 #include "IceDefs.h"
 #include "IceInstMIPS32.def"
+#include "IceOperand.h" // RC_Target
 #include "IceTypes.h"
 
 namespace Ice {
@@ -59,6 +60,9 @@
 
 } // end of namespace RegMIPS32
 
+// MIPS32 extensions of enum RegClass. None so far: RCMIPS32_NUM == RC_Target.
+enum RegClassMIPS32 : uint8_t { RCMIPS32_NUM = RC_Target };
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEREGISTERSMIPS32_H
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index f518225..aff423c 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -234,7 +234,8 @@
 
   virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                               RegSetMask Exclude) const = 0;
-  virtual const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const = 0;
+  virtual const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const = 0;
   virtual const llvm::SmallBitVector &getAliasesForRegister(SizeT) const = 0;
 
   void regAlloc(RegAllocKind Kind);
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 62a903d..f6029c4 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -74,8 +74,11 @@
   IceString getRegName(SizeT RegNum, Type Ty) const override;
   llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                       RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    // Only type-derived classes are valid: the table has RC_Target entries.
+    assert(Var->getRegClass() < RC_Target);
+    return TypeToRegisterSet[Var->getRegClass()];
   }
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
     return RegisterAliases[Reg];
@@ -554,7 +557,7 @@
   bool MaybeLeafFunc = true;
   size_t SpillAreaSizeBytes = 0;
   // TODO(jpp): std::array instead of array.
-  static llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
+  static llvm::SmallBitVector TypeToRegisterSet[RCARM32_NUM];
   static llvm::SmallBitVector RegisterAliases[RegARM32::Reg_NUM];
   static llvm::SmallBitVector ScratchRegs;
   llvm::SmallBitVector RegsUsed;
diff --git a/src/IceTargetLoweringMIPS32.h b/src/IceTargetLoweringMIPS32.h
index 3cd1687..c01c6c2 100644
--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -42,8 +42,11 @@
   IceString getRegName(SizeT RegNum, Type Ty) const override;
   llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                       RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    // Only type-derived classes are valid: the table has RC_Target entries.
+    assert(Var->getRegClass() < RC_Target);
+    return TypeToRegisterSet[Var->getRegClass()];
   }
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
     return RegisterAliases[Reg];
@@ -231,7 +234,7 @@
 
   bool UsesFramePointer = false;
   bool NeedsStackAlignment = false;
-  static llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
+  static llvm::SmallBitVector TypeToRegisterSet[RCMIPS32_NUM];
   static llvm::SmallBitVector RegisterAliases[RegMIPS32::Reg_NUM];
   static llvm::SmallBitVector ScratchRegs;
   llvm::SmallBitVector RegsUsed;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 6918cdf..2ecd77d 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -74,7 +74,7 @@
 const MachineTraits<TargetX8632>::TableTypeX8632AttributesType
     MachineTraits<TargetX8632>::TableTypeX8632Attributes[] = {
 #define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
-  { elementty }                                                                \
+  { IceType_##elementty }                                                      \
   ,
         ICETYPEX8632_TABLE
 #undef X
@@ -87,7 +87,7 @@
 const char *MachineTraits<TargetX8632>::TargetName = "X8632";
 
 template <>
-std::array<llvm::SmallBitVector, IceType_NUM>
+std::array<llvm::SmallBitVector, RCX86_NUM>
     TargetX86Base<TargetX8632>::TypeToRegisterSet = {};
 
 template <>
@@ -957,7 +957,7 @@
 };
 // Define a set of constants based on high-level table entries.
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  static const int _table1_##tag = tag;
+  static const int _table1_##tag = IceType_##tag;
 ICETYPE_TABLE
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index 1c92bed..cd26702 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -22,8 +22,9 @@
 #include "IceInstX8632.def"
 #include "IceOperand.h"
 #include "IceRegistersX8632.h"
-#include "IceTargetLoweringX8632.def"
 #include "IceTargetLowering.h"
+#include "IceTargetLoweringX8632.def"
+#include "IceTargetLoweringX86RegClass.h"
 
 #include <array>
 
@@ -398,7 +399,7 @@
   }
 
   static void initRegisterSet(
-      std::array<llvm::SmallBitVector, IceType_NUM> *TypeToRegisterSet,
+      std::array<llvm::SmallBitVector, RCX86_NUM> *TypeToRegisterSet,
       std::array<llvm::SmallBitVector, RegisterSet::Reg_NUM> *RegisterAliases,
       llvm::SmallBitVector *ScratchRegs) {
     llvm::SmallBitVector IntegerRegistersI32(RegisterSet::Reg_NUM);
@@ -406,6 +407,11 @@
     llvm::SmallBitVector IntegerRegistersI8(RegisterSet::Reg_NUM);
     llvm::SmallBitVector FloatRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector VectorRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc64To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc32To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc16To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc8RcvrRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector AhRcvrRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector InvalidRegisters(RegisterSet::Reg_NUM);
     ScratchRegs->resize(RegisterSet::Reg_NUM);
 #define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,     \
@@ -416,6 +422,11 @@
   (IntegerRegistersI8)[RegisterSet::val] = is8;                                \
   (FloatRegisters)[RegisterSet::val] = isXmm;                                  \
   (VectorRegisters)[RegisterSet::val] = isXmm;                                 \
+  (Trunc64To8Registers)[RegisterSet::val] = is64To8;                           \
+  (Trunc32To8Registers)[RegisterSet::val] = is32To8;                           \
+  (Trunc16To8Registers)[RegisterSet::val] = is16To8;                           \
+  (Trunc8RcvrRegisters)[RegisterSet::val] = isTrunc8Rcvr;                      \
+  (AhRcvrRegisters)[RegisterSet::val] = isAhRcvr;                              \
   (*RegisterAliases)[RegisterSet::val].resize(RegisterSet::Reg_NUM);           \
   for (SizeT RegAlias : aliases) {                                             \
     assert(!(*RegisterAliases)[RegisterSet::val][RegAlias] &&                  \
@@ -427,21 +438,26 @@
     REGX8632_TABLE;
 #undef X
 
-    (*TypeToRegisterSet)[IceType_void] = InvalidRegisters;
-    (*TypeToRegisterSet)[IceType_i1] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i8] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i16] = IntegerRegistersI16;
-    (*TypeToRegisterSet)[IceType_i32] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_i64] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_f32] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_f64] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_v4i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i8] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i16] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4i32] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_void] = InvalidRegisters;
+    (*TypeToRegisterSet)[RC_i1] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i8] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i16] = IntegerRegistersI16;
+    (*TypeToRegisterSet)[RC_i32] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_i64] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_f32] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_f64] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_v4i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i8] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i16] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4i32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RCX86_Is64To8] = Trunc64To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is32To8] = Trunc32To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is16To8] = Trunc16To8Registers;
+    (*TypeToRegisterSet)[RCX86_IsTrunc8Rcvr] = Trunc8RcvrRegisters;
+    (*TypeToRegisterSet)[RCX86_IsAhRcvr] = AhRcvrRegisters;
   }
 
   static llvm::SmallBitVector
@@ -512,7 +528,12 @@
     Index |= (is8 << (AttrKey++));                                             \
     Index |= (is16 << (AttrKey++));                                            \
     Index |= (is32 << (AttrKey++));                                            \
+    Index |= (is64 << (AttrKey++));                                            \
     Index |= (isXmm << (AttrKey++));                                           \
+    Index |= (is16To8 << (AttrKey++));                                         \
+    Index |= (is32To8 << (AttrKey++));                                         \
+    Index |= (is64To8 << (AttrKey++));                                         \
+    Index |= (isTrunc8Rcvr << (AttrKey++));                                    \
     /* val is assigned to an equivalence class based on its properties. */     \
     EquivalenceClasses[Index].push_back(RegisterSet::val);                     \
   }
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 1783b26..bdebfbe 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -74,7 +74,7 @@
 const MachineTraits<TargetX8664>::TableTypeX8664AttributesType
     MachineTraits<TargetX8664>::TableTypeX8664Attributes[] = {
 #define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
-  { elementty }                                                                \
+  { IceType_##elementty }                                                      \
   ,
         ICETYPEX8664_TABLE
 #undef X
@@ -87,7 +87,7 @@
 const char *MachineTraits<TargetX8664>::TargetName = "X8664";
 
 template <>
-std::array<llvm::SmallBitVector, IceType_NUM>
+std::array<llvm::SmallBitVector, RCX86_NUM>
     TargetX86Base<TargetX8664>::TypeToRegisterSet = {};
 
 template <>
@@ -955,7 +955,7 @@
 };
 // Define a set of constants based on high-level table entries.
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  static const int _table1_##tag = tag;
+  static const int _table1_##tag = IceType_##tag;
 ICETYPE_TABLE
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 331f076..f6c834a 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -24,6 +24,7 @@
 #include "IceRegistersX8664.h"
 #include "IceTargetLowering.h"
 #include "IceTargetLoweringX8664.def"
+#include "IceTargetLoweringX86RegClass.h"
 
 #include <array>
 
@@ -379,7 +380,7 @@
   static int32_t getGprForType(Type, int32_t RegNum) { return RegNum; }
 
   static void initRegisterSet(
-      std::array<llvm::SmallBitVector, IceType_NUM> *TypeToRegisterSet,
+      std::array<llvm::SmallBitVector, RCX86_NUM> *TypeToRegisterSet,
       std::array<llvm::SmallBitVector, RegisterSet::Reg_NUM> *RegisterAliases,
       llvm::SmallBitVector *ScratchRegs) {
     llvm::SmallBitVector IntegerRegistersI64(RegisterSet::Reg_NUM);
@@ -388,6 +389,11 @@
     llvm::SmallBitVector IntegerRegistersI8(RegisterSet::Reg_NUM);
     llvm::SmallBitVector FloatRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector VectorRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc64To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc32To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc16To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc8RcvrRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector AhRcvrRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector InvalidRegisters(RegisterSet::Reg_NUM);
     ScratchRegs->resize(RegisterSet::Reg_NUM);
 
@@ -400,6 +406,11 @@
   (IntegerRegistersI8)[RegisterSet::val] = is8;                                \
   (FloatRegisters)[RegisterSet::val] = isXmm;                                  \
   (VectorRegisters)[RegisterSet::val] = isXmm;                                 \
+  (Trunc64To8Registers)[RegisterSet::val] = is64To8;                           \
+  (Trunc32To8Registers)[RegisterSet::val] = is32To8;                           \
+  (Trunc16To8Registers)[RegisterSet::val] = is16To8;                           \
+  (Trunc8RcvrRegisters)[RegisterSet::val] = isTrunc8Rcvr;                      \
+  (AhRcvrRegisters)[RegisterSet::val] = isAhRcvr;                              \
   (*RegisterAliases)[RegisterSet::val].resize(RegisterSet::Reg_NUM);           \
   for (SizeT RegAlias : aliases) {                                             \
     assert(!(*RegisterAliases)[RegisterSet::val][RegAlias] &&                  \
@@ -411,21 +422,26 @@
     REGX8664_TABLE;
 #undef X
 
-    (*TypeToRegisterSet)[IceType_void] = InvalidRegisters;
-    (*TypeToRegisterSet)[IceType_i1] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i8] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i16] = IntegerRegistersI16;
-    (*TypeToRegisterSet)[IceType_i32] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_i64] = IntegerRegistersI64;
-    (*TypeToRegisterSet)[IceType_f32] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_f64] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_v4i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i8] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i16] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4i32] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_void] = InvalidRegisters;
+    (*TypeToRegisterSet)[RC_i1] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i8] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i16] = IntegerRegistersI16;
+    (*TypeToRegisterSet)[RC_i32] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_i64] = IntegerRegistersI64;
+    (*TypeToRegisterSet)[RC_f32] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_f64] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_v4i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i8] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i16] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4i32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RCX86_Is64To8] = Trunc64To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is32To8] = Trunc32To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is16To8] = Trunc16To8Registers;
+    (*TypeToRegisterSet)[RCX86_IsTrunc8Rcvr] = Trunc8RcvrRegisters;
+    (*TypeToRegisterSet)[RCX86_IsAhRcvr] = AhRcvrRegisters;
   }
 
   static llvm::SmallBitVector
@@ -498,6 +514,10 @@
     Index |= (is32 << (AttrKey++));                                            \
     Index |= (is64 << (AttrKey++));                                            \
     Index |= (isXmm << (AttrKey++));                                           \
+    Index |= (is16To8 << (AttrKey++));                                         \
+    Index |= (is32To8 << (AttrKey++));                                         \
+    Index |= (is64To8 << (AttrKey++));                                         \
+    Index |= (isTrunc8Rcvr << (AttrKey++));                                    \
     /* val is assigned to an equivalence class based on its properties. */     \
     EquivalenceClasses[Index].push_back(RegisterSet::val);                     \
   }
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 1b52aad..bd09b5d 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -20,6 +20,7 @@
 #include "IceInst.h"
 #include "IceSwitchLowering.h"
 #include "IceTargetLowering.h"
+#include "IceTargetLoweringX86RegClass.h"
 #include "IceUtils.h"
 
 #include <array>
@@ -73,8 +74,11 @@
   IceString getRegName(SizeT RegNum, Type Ty) const override;
   llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                       RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
+    return TypeToRegisterSet[RC];
   }
 
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
@@ -263,6 +267,7 @@
   static Type firstTypeThatFitsSize(uint32_t Size,
                                     uint32_t MaxSize = NoSizeLimit);
 
+  Variable *copyToReg8(Operand *Src, int32_t RegNum = Variable::NoRegister);
   Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
 
   /// \name Returns a vector in a register with the given constant entries.
@@ -674,7 +679,7 @@
   bool NeedsStackAlignment = false;
   size_t SpillAreaSizeBytes = 0;
   size_t FixedAllocaSizeBytes = 0;
-  static std::array<llvm::SmallBitVector, IceType_NUM> TypeToRegisterSet;
+  static std::array<llvm::SmallBitVector, RCX86_NUM> TypeToRegisterSet;
   static std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
       RegisterAliases;
   static llvm::SmallBitVector ScratchRegs;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index b22ec64..56ee04d 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1216,8 +1216,7 @@
     //   t1:ecx = c.lo & 0xff
     //   t2 = b.lo
     //   t3 = b.hi
-    T_1 = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-    _mov(T_1, Src1Lo);
+    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
     _mov(T_2, Src0Lo);
     _mov(T_3, Src0Hi);
     switch (Op) {
@@ -1295,6 +1294,7 @@
 template <class Machine>
 void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
   Variable *Dest = Inst->getDest();
+  Type Ty = Dest->getType();
   Operand *Src0 = legalize(Inst->getSrc(0));
   Operand *Src1 = legalize(Inst->getSrc(1));
   if (Inst->isCommutative()) {
@@ -1316,7 +1316,7 @@
     assert(SwapCount <= 1);
     (void)SwapCount;
   }
-  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     // These x86-32 helper-call-involved instructions are lowered in this
     // separate switch. This is because loOperand() and hiOperand() may insert
     // redundant instructions for constant blinding and pooling. Such redundant
@@ -1463,7 +1463,7 @@
     }
     return;
   }
-  if (isVectorType(Dest->getType())) {
+  if (isVectorType(Ty)) {
     // TODO: Trap on integer divide and integer modulo by zero. See:
     // https://code.google.com/p/nativeclient/issues/detail?id=3899
     if (llvm::isa<typename Traits::X86OperandMem>(Src1))
@@ -1473,46 +1473,45 @@
       llvm_unreachable("Unknown arithmetic operator");
       break;
     case InstArithmetic::Add: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _padd(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::And: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _pand(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Or: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _por(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Xor: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _pxor(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Sub: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _psub(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Mul: {
-      bool TypesAreValidForPmull =
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
+      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
       bool InstructionSetIsValidForPmull =
-          Dest->getType() == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
+          Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
-        Variable *T = makeReg(Dest->getType());
+        Variable *T = makeReg(Ty);
         _movp(T, Src0);
         _pmull(T, Src0 == Src1 ? T : Src1);
         _movp(Dest, T);
-      } else if (Dest->getType() == IceType_v4i32) {
+      } else if (Ty == IceType_v4i32) {
         // Lowering sequence:
         // Note: The mask arguments have index 0 on the left.
         //
@@ -1550,7 +1549,7 @@
         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
         _movp(Dest, T4);
-      } else if (Dest->getType() == IceType_v16i8) {
+      } else if (Ty == IceType_v16i8) {
         scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       } else {
         llvm::report_fatal_error("Invalid vector multiply type");
@@ -1566,25 +1565,25 @@
       scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       break;
     case InstArithmetic::Fadd: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _addps(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Fsub: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _subps(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Fmul: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _mulps(T, Src0 == Src1 ? T : Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Fdiv: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _divps(T, Src1);
       _movp(Dest, T);
@@ -1633,13 +1632,13 @@
     }
     // The 8-bit version of imul only allows the form "imul r/m8" where T must
     // be in al.
-    if (isByteSizedArithType(Dest->getType())) {
+    if (isByteSizedArithType(Ty)) {
       _mov(T, Src0, Traits::RegisterSet::Reg_al);
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
       _imul(T, Src0 == Src1 ? T : Src1);
       _mov(Dest, T);
     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-      T = makeReg(Dest->getType());
+      T = makeReg(Ty);
       _imul_imm(T, Src0, ImmConst);
       _mov(Dest, T);
     } else {
@@ -1650,76 +1649,51 @@
     break;
   case InstArithmetic::Shl:
     _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1)) {
-      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-      _mov(Cl, Src1);
-      Src1 = Cl;
-    }
+    if (!llvm::isa<ConstantInteger32>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
     _shl(T, Src1);
     _mov(Dest, T);
     break;
   case InstArithmetic::Lshr:
     _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1)) {
-      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-      _mov(Cl, Src1);
-      Src1 = Cl;
-    }
+    if (!llvm::isa<ConstantInteger32>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
     _shr(T, Src1);
     _mov(Dest, T);
     break;
   case InstArithmetic::Ashr:
     _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1)) {
-      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-      _mov(Cl, Src1);
-      Src1 = Cl;
-    }
+    if (!llvm::isa<ConstantInteger32>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
     _sar(T, Src1);
     _mov(Dest, T);
     break;
-  case InstArithmetic::Udiv:
+  case InstArithmetic::Udiv: {
     // div and idiv are the few arithmetic operators that do not allow
     // immediates as the operand.
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      // For 8-bit unsigned division we need to zero-extend al into ah. A mov
-      // $0, %ah (or xor %ah, %ah) would work just fine, except that the x86-64
-      // assembler refuses to encode %ah (encoding %spl with a REX prefix
-      // instead.) Accessing %ah in 64-bit is "tricky" as you can't encode %ah
-      // with any other 8-bit register except for %a[lh], %b[lh], %c[lh], and
-      // d[%lh], which means the X86 target lowering (and the register
-      // allocator) would have to be aware of this restriction. For now, we
-      // simply zero %eax completely, and move the dividend into %al.
-      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      Context.insert(InstFakeDef::create(Func, T_eax));
-      _xor(T_eax, T_eax);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      _div(T, Src1, T);
-      _mov(Dest, T);
-      Context.insert(InstFakeUse::create(Func, T_eax));
-    } else {
-      Type Ty = Dest->getType();
-      uint32_t Eax = Traits::RegisterSet::Reg_eax;
-      uint32_t Edx = Traits::RegisterSet::Reg_edx;
-      switch (Ty) {
-      default:
-        llvm_unreachable("Bad type for udiv");
-      // fallthrough
-      case IceType_i32:
-        break;
-      case IceType_i16:
-        Eax = Traits::RegisterSet::Reg_ax;
-        Edx = Traits::RegisterSet::Reg_dx;
-        break;
-      }
-      Constant *Zero = Ctx->getConstantZero(Ty);
-      _mov(T, Src0, Eax);
-      _mov(T_edx, Zero, Edx);
-      _div(T, Src1, T_edx);
-      _mov(Dest, T);
+    uint32_t Eax = Traits::RegisterSet::Reg_eax;
+    uint32_t Edx = Traits::RegisterSet::Reg_edx;
+    switch (Ty) {
+    default:
+      llvm_unreachable("Bad type for udiv");
+    // fallthrough
+    case IceType_i32:
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
     }
-    break;
+    _mov(T, Src0, Eax);
+    _mov(T_edx, Ctx->getConstantZero(Ty), Edx);
+    _div(T, Src1, T_edx);
+    _mov(Dest, T);
+  } break;
   case InstArithmetic::Sdiv:
     // TODO(stichnot): Enable this after doing better performance and cross
     // testing.
@@ -1731,7 +1705,6 @@
         uint32_t UDivisor = static_cast<uint32_t>(Divisor);
         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
           uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          Type Ty = Dest->getType();
           // LLVM does the following for dest=src/(1<<log):
           //   t=src
           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
@@ -1757,7 +1730,7 @@
       }
     }
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    switch (Type Ty = Dest->getType()) {
+    switch (Ty) {
     default:
       llvm_unreachable("Bad type for sdiv");
     // fallthrough
@@ -1778,47 +1751,32 @@
     _idiv(T, Src1, T_edx);
     _mov(Dest, T);
     break;
-  case InstArithmetic::Urem:
+  case InstArithmetic::Urem: {
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      Context.insert(InstFakeDef::create(Func, T_eax));
-      _xor(T_eax, T_eax);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      _div(T, Src1, T);
-      // shr $8, %eax shifts ah (i.e., the 8 bit remainder) into al. We don't
-      // mov %ah, %al because it would make x86-64 codegen more complicated. If
-      // this ever becomes a problem we can introduce a pseudo rem instruction
-      // that returns the remainder in %al directly (and uses a mov for copying
-      // %ah to %al.)
-      static constexpr uint8_t AlSizeInBits = 8;
-      _shr(T_eax, Ctx->getConstantInt8(AlSizeInBits));
-      _mov(Dest, T);
-      Context.insert(InstFakeUse::create(Func, T_eax));
-    } else {
-      Type Ty = Dest->getType();
-      uint32_t Eax = Traits::RegisterSet::Reg_eax;
-      uint32_t Edx = Traits::RegisterSet::Reg_edx;
-      switch (Ty) {
-      default:
-        llvm_unreachable("Bad type for urem");
-      // fallthrough
-      case IceType_i32:
-        break;
-      case IceType_i16:
-        Eax = Traits::RegisterSet::Reg_ax;
-        Edx = Traits::RegisterSet::Reg_dx;
-        break;
-      }
-      Constant *Zero = Ctx->getConstantZero(Ty);
-      T_edx = makeReg(Dest->getType(), Edx);
-      _mov(T_edx, Zero);
-      _mov(T, Src0, Eax);
-      _div(T_edx, Src1, T);
-      _mov(Dest, T_edx);
+    uint32_t Eax = Traits::RegisterSet::Reg_eax;
+    uint32_t Edx = Traits::RegisterSet::Reg_edx;
+    switch (Ty) {
+    default:
+      llvm_unreachable("Bad type for urem");
+    // fallthrough
+    case IceType_i32:
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
     }
-    break;
-  case InstArithmetic::Srem:
+    T_edx = makeReg(Ty, Edx);
+    _mov(T_edx, Ctx->getConstantZero(Ty));
+    _mov(T, Src0, Eax);
+    _div(T_edx, Src1, T);
+    _mov(Dest, T_edx);
+  } break;
+  case InstArithmetic::Srem: {
     // TODO(stichnot): Enable this after doing better performance and cross
     // testing.
     if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
@@ -1829,7 +1787,6 @@
         uint32_t UDivisor = static_cast<uint32_t>(Divisor);
         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
           uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          Type Ty = Dest->getType();
           // LLVM does the following for dest=src%(1<<log):
           //   t=src
           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
@@ -1860,37 +1817,29 @@
       }
     }
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    switch (Type Ty = Dest->getType()) {
+    uint32_t Eax = Traits::RegisterSet::Reg_eax;
+    uint32_t Edx = Traits::RegisterSet::Reg_edx;
+    switch (Ty) {
     default:
       llvm_unreachable("Bad type for srem");
     // fallthrough
     case IceType_i32:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
-      _cbwdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
       break;
     case IceType_i16:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
-      _cbwdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
       break;
     case IceType_i8:
-      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
-      // TODO(stichnot): Use register ah for T_edx, and remove the _shr().
-      // T_edx = makeReg(Ty, Traits::RegisterSet::Reg_ah);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      _cbwdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      static constexpr uint8_t AlSizeInBits = 8;
-      _shr(T_edx, Ctx->getConstantInt8(AlSizeInBits));
-      _mov(Dest, T_edx);
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
       break;
     }
-    break;
+    T_edx = makeReg(Ty, Edx);
+    _mov(T, Src0, Eax);
+    _cbwdq(T_edx, T);
+    _idiv(T_edx, Src1, T);
+    _mov(Dest, T_edx);
+  } break;
   case InstArithmetic::Fadd:
     _mov(T, Src0);
     _addss(T, Src1);
@@ -1913,7 +1862,6 @@
     break;
   case InstArithmetic::Frem: {
     constexpr SizeT MaxSrcs = 2;
-    Type Ty = Dest->getType();
     InstCall *Call = makeHelperCall(
         isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
     Call->addArg(Src0);
@@ -1991,6 +1939,7 @@
   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
   InstCast::OpKind CastKind = Inst->getCastKind();
   Variable *Dest = Inst->getDest();
+  Type DestTy = Dest->getType();
   switch (CastKind) {
   default:
     Func->setError("Cast type not supported");
@@ -2003,15 +1952,14 @@
     // we're unlikely to see something like that in the bitcode that the
     // optimizer wouldn't have already taken care of.
     Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(Dest->getType())) {
-      Type DestTy = Dest->getType();
+    if (isVectorType(DestTy)) {
       if (DestTy == IceType_v16i8) {
         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
-        Variable *OneMask = makeVectorOfOnes(Dest->getType());
+        Variable *OneMask = makeVectorOfOnes(DestTy);
         Variable *T = makeReg(DestTy);
         _movp(T, Src0RM);
         _pand(T, OneMask);
-        Variable *Zeros = makeVectorOfZeros(Dest->getType());
+        Variable *Zeros = makeVectorOfZeros(DestTy);
         _pcmpgt(T, Zeros);
         _movp(Dest, T);
       } else {
@@ -2026,7 +1974,7 @@
         _psra(T, ShiftConstant);
         _movp(Dest, T);
       }
-    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
       Constant *Shift = Ctx->getConstantInt32(31);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -2053,12 +2001,10 @@
       // shl t1, dst_bitwidth - 1
       // sar t1, dst_bitwidth - 1
       // dst = t1
-      size_t DestBits =
-          Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType());
+      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
-      Variable *T = makeReg(Dest->getType());
-      if (typeWidthInBytes(Dest->getType()) <=
-          typeWidthInBytes(Src0RM->getType())) {
+      Variable *T = makeReg(DestTy);
+      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
         _mov(T, Src0RM);
       } else {
         // Widen the source using movsx or movzx. (It doesn't matter which one,
@@ -2070,7 +2016,7 @@
       _mov(Dest, T);
     } else {
       // t1 = movsx src; dst = t1
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _movsx(T, Src0RM);
       _mov(Dest, T);
     }
@@ -2078,15 +2024,14 @@
   }
   case InstCast::Zext: {
     Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(Dest->getType())) {
+    if (isVectorType(DestTy)) {
       // onemask = materialize(1,1,...); dest = onemask & src
-      Type DestTy = Dest->getType();
       Variable *OneMask = makeVectorOfOnes(DestTy);
       Variable *T = makeReg(DestTy);
       _movp(T, Src0RM);
       _pand(T, OneMask);
       _movp(Dest, T);
-    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
       // t1=movzx src; dst.lo=t1; dst.hi=0
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -2101,7 +2046,6 @@
       _mov(DestHi, Zero);
     } else if (Src0RM->getType() == IceType_i1) {
       // t = Src0RM; Dest = t
-      Type DestTy = Dest->getType();
       Variable *T = nullptr;
       if (DestTy == IceType_i8) {
         _mov(T, Src0RM);
@@ -2117,32 +2061,40 @@
       _mov(Dest, T);
     } else {
       // t1 = movzx src; dst = t1
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _movzx(T, Src0RM);
       _mov(Dest, T);
     }
     break;
   }
   case InstCast::Trunc: {
-    if (isVectorType(Dest->getType())) {
+    if (isVectorType(DestTy)) {
       // onemask = materialize(1,1,...); dst = src & onemask
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       Type Src0Ty = Src0RM->getType();
       Variable *OneMask = makeVectorOfOnes(Src0Ty);
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _movp(T, Src0RM);
       _pand(T, OneMask);
       _movp(Dest, T);
+    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
+      // Make sure we truncate from and into valid registers.
+      Operand *Src0 = legalizeUndef(Inst->getSrc(0));
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      Variable *T = copyToReg8(Src0RM);
+      if (DestTy == IceType_i1)
+        _and(T, Ctx->getConstantInt1(1));
+      _mov(Dest, T);
     } else {
       Operand *Src0 = legalizeUndef(Inst->getSrc(0));
       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
         Src0 = loOperand(Src0);
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       // t1 = trunc Src0RM; Dest = t1
-      Variable *T = nullptr;
+      Variable *T = makeReg(DestTy);
       _mov(T, Src0RM);
-      if (Dest->getType() == IceType_i1)
-        _and(T, Ctx->getConstantInt1(1));
       _mov(Dest, T);
     }
     break;
@@ -2151,22 +2103,22 @@
   case InstCast::Fpext: {
     Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
     // t1 = cvt Src0RM; Dest = t1
-    Variable *T = makeReg(Dest->getType());
+    Variable *T = makeReg(DestTy);
     _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
     _mov(Dest, T);
     break;
   }
   case InstCast::Fptosi:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4i32 &&
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32 &&
              Inst->getSrc(0)->getType() == IceType_v4f32);
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       if (llvm::isa<typename Traits::X86OperandMem>(Src0RM))
         Src0RM = legalizeToReg(Src0RM);
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq);
       _movp(Dest, T);
-    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
       constexpr SizeT MaxSrcs = 1;
       Type SrcType = Inst->getSrc(0)->getType();
       InstCall *Call =
@@ -2179,40 +2131,44 @@
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
       Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      if (Traits::Is64Bit && DestTy == IceType_i64) {
         T_1 = makeReg(IceType_i64);
       } else {
-        assert(Dest->getType() != IceType_i64);
+        assert(DestTy != IceType_i64);
         T_1 = makeReg(IceType_i32);
       }
       // cvt() requires its integer argument to be a GPR.
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (Dest->getType() == IceType_i1)
+      if (DestTy == IceType_i1)
         _and(T_2, Ctx->getConstantInt1(1));
       _mov(Dest, T_2);
     }
     break;
   case InstCast::Fptoui:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4i32 &&
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32 &&
              Inst->getSrc(0)->getType() == IceType_v4f32);
       constexpr SizeT MaxSrcs = 1;
       InstCall *Call = makeHelperCall(H_fptoui_4xi32_f32, Dest, MaxSrcs);
       Call->addArg(Inst->getSrc(0));
       lowerCall(Call);
-    } else if (Dest->getType() == IceType_i64 ||
-               (!Traits::Is64Bit && Dest->getType() == IceType_i32)) {
+    } else if (DestTy == IceType_i64 ||
+               (!Traits::Is64Bit && DestTy == IceType_i32)) {
       // Use a helper for both x86-32 and x86-64.
       constexpr SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
       Type SrcType = Inst->getSrc(0)->getType();
       IceString TargetString;
       if (Traits::Is64Bit) {
         TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
                                                          : H_fptoui_f64_i64;
-      } else if (isInt32Asserting32Or64(DestType)) {
+      } else if (isInt32Asserting32Or64(DestTy)) {
         TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
                                                          : H_fptoui_f64_i32;
       } else {
@@ -2226,39 +2182,43 @@
     } else {
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      assert(Dest->getType() != IceType_i64);
+      assert(DestTy != IceType_i64);
       Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Dest->getType() == IceType_i32) {
+      if (Traits::Is64Bit && DestTy == IceType_i32) {
         T_1 = makeReg(IceType_i64);
       } else {
-        assert(Dest->getType() != IceType_i32);
+        assert(DestTy != IceType_i32);
         T_1 = makeReg(IceType_i32);
       }
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (Dest->getType() == IceType_i1)
+      if (DestTy == IceType_i1)
         _and(T_2, Ctx->getConstantInt1(1));
       _mov(Dest, T_2);
     }
     break;
   case InstCast::Sitofp:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4f32 &&
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4f32 &&
              Inst->getSrc(0)->getType() == IceType_v4i32);
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       if (llvm::isa<typename Traits::X86OperandMem>(Src0RM))
         Src0RM = legalizeToReg(Src0RM);
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _cvt(T, Src0RM, Traits::Insts::Cvt::Dq2ps);
       _movp(Dest, T);
     } else if (!Traits::Is64Bit && Inst->getSrc(0)->getType() == IceType_i64) {
       // Use a helper for x86-32.
       constexpr SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
       InstCall *Call =
-          makeHelperCall(isFloat32Asserting32Or64(DestType) ? H_sitofp_i64_f32
-                                                            : H_sitofp_i64_f64,
+          makeHelperCall(isFloat32Asserting32Or64(DestTy) ? H_sitofp_i64_f32
+                                                          : H_sitofp_i64_f64,
                          Dest, MaxSrcs);
       // TODO: Call the correct compiler-rt helper function.
       Call->addArg(Inst->getSrc(0));
@@ -2275,7 +2235,7 @@
         assert(Src0RM->getType() != IceType_i64);
         T_1 = makeReg(IceType_i32);
       }
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
       if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
@@ -2287,8 +2247,7 @@
   case InstCast::Uitofp: {
     Operand *Src0 = Inst->getSrc(0);
     if (isVectorType(Src0->getType())) {
-      assert(Dest->getType() == IceType_v4f32 &&
-             Src0->getType() == IceType_v4i32);
+      assert(DestTy == IceType_v4f32 && Src0->getType() == IceType_v4i32);
       constexpr SizeT MaxSrcs = 1;
       InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs);
       Call->addArg(Src0);
@@ -2298,14 +2257,13 @@
       // Use a helper for x86-32 and x86-64. Also use a helper for i32 on
       // x86-32.
       constexpr SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
       IceString TargetString;
       if (isInt32Asserting32Or64(Src0->getType())) {
-        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32
-                                                          : H_uitofp_i32_f64;
+        TargetString = isFloat32Asserting32Or64(DestTy) ? H_uitofp_i32_f32
+                                                        : H_uitofp_i32_f64;
       } else {
-        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32
-                                                          : H_uitofp_i64_f64;
+        TargetString = isFloat32Asserting32Or64(DestTy) ? H_uitofp_i64_f32
+                                                        : H_uitofp_i64_f64;
       }
       InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
       Call->addArg(Src0);
@@ -2323,7 +2281,7 @@
         assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
         T_1 = makeReg(IceType_i32);
       }
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
       if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
@@ -2335,12 +2293,12 @@
   }
   case InstCast::Bitcast: {
     Operand *Src0 = Inst->getSrc(0);
-    if (Dest->getType() == Src0->getType()) {
+    if (DestTy == Src0->getType()) {
       InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
       lowerAssign(Assign);
       return;
     }
-    switch (Dest->getType()) {
+    switch (DestTy) {
     default:
       llvm_unreachable("Unexpected Bitcast dest type");
     case IceType_i8: {
@@ -2358,11 +2316,9 @@
     case IceType_i32:
     case IceType_f32: {
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      Type DestType = Dest->getType();
       Type SrcType = Src0RM->getType();
-      (void)DestType;
-      assert((DestType == IceType_i32 && SrcType == IceType_f32) ||
-             (DestType == IceType_f32 && SrcType == IceType_i32));
+      assert((DestTy == IceType_i32 && SrcType == IceType_f32) ||
+             (DestTy == IceType_f32 && SrcType == IceType_i32));
       // a.i32 = bitcast b.f32 ==>
       //   t.f32 = b.f32
       //   s.f32 = spill t.f32
@@ -2436,7 +2392,7 @@
       } else {
         Src0 = legalize(Src0);
         if (llvm::isa<typename Traits::X86OperandMem>(Src0)) {
-          Variable *T = Func->makeVariable(Dest->getType());
+          Variable *T = Func->makeVariable(DestTy);
           _movq(T, Src0);
           _movq(Dest, T);
           break;
@@ -3037,17 +2993,21 @@
         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
     Variable *T = makeReg(Ty);
     _movp(T, SourceVectRM);
-    if (Ty == IceType_v4f32)
+    if (Ty == IceType_v4f32) {
       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
-    else
-      // TODO(stichnot): For the pinsrb and pinsrw instructions, when the source
-      // operand is a register, it must be a full r32 register like eax, and not
-      // ax/al/ah.  For filetype=asm, InstX86Pinsr<Machine>::emit() compensates
-      // for the use of r16 and r8 by converting them through getBaseReg(),
-      // while emitIAS() validates that the original and base register encodings
-      // are the same.  But for an "interior" register like ah, it should
-      // probably be copied into an r32 via movzx so that the types work out.
+    } else {
+      // For the pinsrb and pinsrw instructions, when the source operand is a
+      // register, it must be a full r32 register like eax, and not ax/al/ah.
+      // For filetype=asm, InstX86Pinsr<Machine>::emit() compensates for the use
+      // of r16 and r8 by converting them through getBaseReg(), while emitIAS()
+      // validates that the original and base register encodings are the same.
+      if (ElementRM->getType() == IceType_i8 &&
+          llvm::isa<Variable>(ElementRM)) {
+        // Don't use ah/bh/ch/dh for pinsrb.
+        ElementRM = copyToReg8(ElementRM);
+      }
       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
+    }
     _movp(Inst->getDest(), T);
   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
     // Use shufps or movss.
@@ -5354,6 +5314,67 @@
   return Traits::X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
 }
 
+/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
+/// Src is assumed to already be legalized.  If the source operand is known to
+/// be a memory or immediate operand, a simple mov will suffice.  But if the
+/// source operand can be a physical register, then it must first be copied into
+/// a physical register that is truncable to 8-bit, then truncated into a
+/// physical register that can receive a truncation, and finally copied into the
+/// result 8-bit register (which in general can be any 8-bit register).  For
+/// example, moving %ebp into %ah may be accomplished as:
+///   movl %ebp, %edx
+///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
+///   movb %dl, %ah
+/// On the other hand, moving a memory or immediate operand into ah:
+///   movb 4(%ebp), %ah
+///   movb $my_imm, %ah
+///
+/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
+/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
+/// use RegNum=NoRegister and then let the caller do a separate copy into
+/// Reg_ah.
+///
+/// Note #2.  ConstantRelocatable operands are also put through this process
+/// (not truncated directly) because our ELF emitter does R_386_32 relocations
+/// but not R_386_8 relocations.
+///
+/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
+/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
+/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
+/// to the pinsrb instruction.
+template <class Machine>
+Variable *TargetX86Base<Machine>::copyToReg8(Operand *Src, int32_t RegNum) {
+  Type Ty = Src->getType();
+  assert(isScalarIntegerType(Ty));
+  assert(Ty != IceType_i1);
+  Variable *Reg = makeReg(IceType_i8, RegNum);
+  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
+  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
+    Variable *SrcTruncable = makeReg(Ty);
+    switch (Ty) {
+    case IceType_i64:
+      SrcTruncable->setRegClass(RCX86_Is64To8);
+      break;
+    case IceType_i32:
+      SrcTruncable->setRegClass(RCX86_Is32To8);
+      break;
+    case IceType_i16:
+      SrcTruncable->setRegClass(RCX86_Is16To8);
+      break;
+    default:
+      // i8 - just use default register class
+      break;
+    }
+    Variable *SrcRcvr = makeReg(IceType_i8);
+    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
+    _mov(SrcTruncable, Src);
+    _mov(SrcRcvr, SrcTruncable);
+    Src = SrcRcvr;
+  }
+  _mov(Reg, Src);
+  return Reg;
+}
+
 /// Helper for legalize() to emit the right code to lower an operand to a
 /// register of the appropriate type.
 template <class Machine>
diff --git a/src/IceTargetLoweringX86RegClass.h b/src/IceTargetLoweringX86RegClass.h
new file mode 100644
index 0000000..35ae316
--- /dev/null
+++ b/src/IceTargetLoweringX86RegClass.h
@@ -0,0 +1,36 @@
+//===- subzero/src/IceTargetLoweringX86RegClass.h - x86 reg class -*- C++ -*-=//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the X86 register class extensions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
+
+#include "IceOperand.h" // RC_Target
+
+namespace Ice {
+namespace X86Internal {
+
+// Extend enum RegClass with x86-specific register classes.
+enum RegClassX86 : uint8_t {
+  RCX86_Is64To8 = RC_Target, // 64-bit GPR trivially truncable to 8-bit
+  RCX86_Is32To8,             // 32-bit GPR trivially truncable to 8-bit
+  RCX86_Is16To8,             // 16-bit GPR trivially truncable to 8-bit
+  RCX86_IsTrunc8Rcvr,        // 8-bit GPR that can receive a trunc operation
+  RCX86_IsAhRcvr,            // 8-bit GPR that can be a mov dest from %ah
+  RCX86_NUM
+};
+
+} // end of namespace X86Internal
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
diff --git a/src/IceTypes.cpp b/src/IceTypes.cpp
index 3756a10..d51d0ba 100644
--- a/src/IceTypes.cpp
+++ b/src/IceTypes.cpp
@@ -95,7 +95,7 @@
 
 const struct TypeAttributeFields TypeAttributes[] = {
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  { sizeLog2, align, elts, elty, str }                                         \
+  { sizeLog2, align, elts, IceType_##elty, str }                               \
   ,
     ICETYPE_TABLE
 #undef X
@@ -120,7 +120,8 @@
           CompareResult)                                                       \
   {                                                                            \
     IsVec, IsInt, IsInt & !IsVec, IsInt & IsVec, IsIntArith, IsFloat,          \
-        IsFloat & !IsVec, IsFloat & IsVec, IsLoadStore, IsParam, CompareResult \
+        IsFloat & !IsVec, IsFloat & IsVec, IsLoadStore, IsParam,               \
+        IceType_##CompareResult                                                \
   }                                                                            \
   ,
     ICETYPE_PROPS_TABLE
diff --git a/src/IceTypes.def b/src/IceTypes.def
index 8db8c63..8149596 100644
--- a/src/IceTypes.def
+++ b/src/IceTypes.def
@@ -29,23 +29,23 @@
 //#define X(tag, str, is_elf64, e_machine, e_flags)
 
 #define ICETYPE_TABLE                                                          \
-  /* enum value, log_2(size), align, # elts, element type, printable */        \
-  /*     string (size and alignment in bytes) */                               \
-  X(IceType_void,  -1,  0,     1,      IceType_void, "void")                   \
-  X(IceType_i1,     0,  1,     1,      IceType_i1,   "i1")                     \
-  X(IceType_i8,     0,  1,     1,      IceType_i8,   "i8")                     \
-  X(IceType_i16,    1,  1,     1,      IceType_i16,  "i16")                    \
-  X(IceType_i32,    2,  1,     1,      IceType_i32,  "i32")                    \
-  X(IceType_i64,    3,  1,     1,      IceType_i64,  "i64")                    \
-  X(IceType_f32,    2,  4,     1,      IceType_f32,  "float")                  \
-  X(IceType_f64,    3,  8,     1,      IceType_f64,  "double")                 \
-  X(IceType_v4i1,   4,  1,     4,      IceType_i1,   "<4 x i1>")               \
-  X(IceType_v8i1,   4,  1,     8,      IceType_i1,   "<8 x i1>")               \
-  X(IceType_v16i1,  4,  1,    16,      IceType_i1,   "<16 x i1>")              \
-  X(IceType_v16i8,  4,  1,    16,      IceType_i8,   "<16 x i8>")              \
-  X(IceType_v8i16,  4,  2,     8,      IceType_i16,  "<8 x i16>")              \
-  X(IceType_v4i32,  4,  4,     4,      IceType_i32,  "<4 x i32>")              \
-  X(IceType_v4f32,  4,  4,     4,      IceType_f32,  "<4 x float>")            \
+  /* type tag (sans IceType_ prefix), log_2(size), align, # elts, */           \
+  /*     element type tag, printable string (size and alignment in bytes) */   \
+  X(void,  -1,  0,  1, void, "void")                                           \
+  X(i1,     0,  1,  1, i1,   "i1")                                             \
+  X(i8,     0,  1,  1, i8,   "i8")                                             \
+  X(i16,    1,  1,  1, i16,  "i16")                                            \
+  X(i32,    2,  1,  1, i32,  "i32")                                            \
+  X(i64,    3,  1,  1, i64,  "i64")                                            \
+  X(f32,    2,  4,  1, f32,  "float")                                          \
+  X(f64,    3,  8,  1, f64,  "double")                                         \
+  X(v4i1,   4,  1,  4, i1,   "<4 x i1>")                                       \
+  X(v8i1,   4,  1,  8, i1,   "<8 x i1>")                                       \
+  X(v16i1,  4,  1, 16, i1,   "<16 x i1>")                                      \
+  X(v16i8,  4,  1, 16, i8,   "<16 x i8>")                                      \
+  X(v8i16,  4,  2,  8, i16,  "<8 x i16>")                                      \
+  X(v4i32,  4,  4,  4, i32,  "<4 x i32>")                                      \
+  X(v4f32,  4,  4,  4, f32,  "<4 x float>")                                    \
 //#define X(tag, sizeLog2, align, elts, elty, str)
 
 // Dictionary:
@@ -58,22 +58,22 @@
 //   CR - Result type of compare instruction for argument type
 //        (IceType_void if disallowed)
 #define ICETYPE_PROPS_TABLE                                                    \
-  /* Enum Value    V  I  F IA  LS P CR */                                      \
-  X(IceType_void,  0, 0, 0, 0, 0, 0, IceType_void)                             \
-  X(IceType_i1,    0, 1, 0, 0, 0, 0, IceType_i1)                               \
-  X(IceType_i8,    0, 1, 0, 1, 1, 0, IceType_i1)                               \
-  X(IceType_i16,   0, 1, 0, 1, 1, 0, IceType_i1)                               \
-  X(IceType_i32,   0, 1, 0, 1, 1, 1, IceType_i1)                               \
-  X(IceType_i64,   0, 1, 0, 1, 1, 1, IceType_i1)                               \
-  X(IceType_f32,   0, 0, 1, 0, 1, 1, IceType_i1)                               \
-  X(IceType_f64,   0, 0, 1, 0, 1, 1, IceType_i1)                               \
-  X(IceType_v4i1,  1, 1, 0, 0, 0, 1, IceType_v4i1)                             \
-  X(IceType_v8i1,  1, 1, 0, 0, 0, 1, IceType_v8i1)                             \
-  X(IceType_v16i1, 1, 1, 0, 0, 0, 1, IceType_v16i1)                            \
-  X(IceType_v16i8, 1, 1, 0, 1, 1, 1, IceType_v16i1)                            \
-  X(IceType_v8i16, 1, 1, 0, 1, 1, 1, IceType_v8i1)                             \
-  X(IceType_v4i32, 1, 1, 0, 1, 1, 1, IceType_v4i1)                             \
-  X(IceType_v4f32, 1, 0, 1, 0, 1, 1, IceType_v4i1)                             \
+  /* Type tag      V  I  F IA LS  P  CR */                                     \
+  X(void,          0, 0, 0, 0, 0, 0, void)                                     \
+  X(i1,            0, 1, 0, 0, 0, 0, i1)                                       \
+  X(i8,            0, 1, 0, 1, 1, 0, i1)                                       \
+  X(i16,           0, 1, 0, 1, 1, 0, i1)                                       \
+  X(i32,           0, 1, 0, 1, 1, 1, i1)                                       \
+  X(i64,           0, 1, 0, 1, 1, 1, i1)                                       \
+  X(f32,           0, 0, 1, 0, 1, 1, i1)                                       \
+  X(f64,           0, 0, 1, 0, 1, 1, i1)                                       \
+  X(v4i1,          1, 1, 0, 0, 0, 1, v4i1)                                     \
+  X(v8i1,          1, 1, 0, 0, 0, 1, v8i1)                                     \
+  X(v16i1,         1, 1, 0, 0, 0, 1, v16i1)                                    \
+  X(v16i8,         1, 1, 0, 1, 1, 1, v16i1)                                    \
+  X(v8i16,         1, 1, 0, 1, 1, 1, v8i1)                                     \
+  X(v4i32,         1, 1, 0, 1, 1, 1, v4i1)                                     \
+  X(v4f32,         1, 0, 1, 0, 1, 1, v4i1)                                     \
 //#define X(tag, IsVec, IsInt, IsFloat, IsIntArith, IsLoadStore, IsParam,      \
 //          CompareResult)
 
diff --git a/src/IceTypes.h b/src/IceTypes.h
index f6705e9..380a613 100644
--- a/src/IceTypes.h
+++ b/src/IceTypes.h
@@ -22,7 +22,7 @@
 namespace Ice {
 
 enum Type {
-#define X(tag, sizeLog2, align, elts, elty, str) tag,
+#define X(tag, sizeLog2, align, elts, elty, str) IceType_##tag,
   ICETYPE_TABLE
 #undef X
       IceType_NUM
