Subzero: Refactor x86 register representation to actively use aliases.

Sets up additional register attributes, plus the notion of register classes, to enable robust usage of the high 8-bit GPRs (ah/bh/ch/dh), for both x86-32 and x86-64.  (Note that the x86-64 changes are currently untested.)

We add a Register Class field to the Variable class.  The default register class is a value corresponding to the variable's type, but the target can extend the set of register class values, and the target lowering can assign different register classes as needed.  The register allocator uses the register class instead of the type to determine the set of registers to draw from.

For x86-64, the high 8-bit registers are not included in the general register allocation pool, but there are explicit references to ah for lowering the div/rem instructions.

The target lowering is modified as needed to make sure types are appropriate and register use in instructions is legalized.

Some other fixes and cleanups are included in this CL:

* Makefile.standalone changes.  Source files are reordered so that the more expensive compiles are done earlier, speeding up parallel builds by decreasing fragmentation.  A dependency error is fixed for check-spec.

* A bug is fixed in advanced phi lowering.  When a temporary is introduced to break a cycle, we were neglecting to update the predecessor count for one of the operands, leading to an assertion failure.  (Applying that fix to master resulted in no changes to spec2k code generation.)  A consistency check is added to help find future problems like this.  Also, refactored iteration over the Phi descriptor array to use range-based for loops and avoid directly indexing the array.

* Removed most of the "IceType_" prefixes in x-macro tables for brevity.

* Fix a correctness TODO in the register allocator.  This had no effect on spec2k code generation in master or in this CL, so we were probably just lucky.

* Made some much-needed s/Dest->getType()/Ty/ changes for brevity, in the target lowering sections that needed other changes.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4095
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1427973003 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 9eb934b..498c5e9 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -457,19 +457,17 @@
     return;
   // Sort by decreasing alignment.  This does not really matter at the moment,
   // but will allow compacting stack allocation when we fuse to one alloca.
-  std::sort(Allocas.begin(), Allocas.end(),
-            [](Inst *I1, Inst *I2) {
-              auto *A1 = llvm::dyn_cast<InstAlloca>(I1);
-              auto *A2 = llvm::dyn_cast<InstAlloca>(I2);
-              return A1->getAlignInBytes() > A2->getAlignInBytes();
-            });
-  for (Inst *Instr: Allocas) {
+  std::sort(Allocas.begin(), Allocas.end(), [](Inst *I1, Inst *I2) {
+    auto *A1 = llvm::dyn_cast<InstAlloca>(I1);
+    auto *A2 = llvm::dyn_cast<InstAlloca>(I2);
+    return A1->getAlignInBytes() > A2->getAlignInBytes();
+  });
+  for (Inst *Instr : Allocas) {
     auto *Alloca = llvm::cast<InstAlloca>(Instr);
     // Move the alloca to its sorted position.
-    InstAlloca *NewAlloca = InstAlloca::create(this,
-                                               Alloca->getSizeInBytes(),
-                                               Alloca->getAlignInBytes(),
-                                               Alloca->getDest());
+    InstAlloca *NewAlloca =
+        InstAlloca::create(this, Alloca->getSizeInBytes(),
+                           Alloca->getAlignInBytes(), Alloca->getDest());
     if (IsKnownFrameOffset)
       NewAlloca->setKnownFrameOffset();
     Insts.push_front(NewAlloca);
@@ -506,8 +504,7 @@
         // Allocations aligned more than the stack require a frame pointer.
         RequiresFramePointer = true;
         AlignedAllocas.push_back(Alloca);
-      }
-      else
+      } else
         FixedAllocas.push_back(Alloca);
     }
   }
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index e0628c4..004a0b3 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -304,6 +304,7 @@
   PhiDesc() = delete;
   PhiDesc(const PhiDesc &) = delete;
   PhiDesc &operator=(const PhiDesc &) = delete;
+
 public:
   PhiDesc(InstPhi *Phi, Variable *Dest) : Phi(Phi), Dest(Dest) {}
   PhiDesc(PhiDesc &&) = default;
@@ -457,7 +458,7 @@
         if (Item2.Processed)
           continue;
         // There shouldn't be two different Phis with the same Dest variable or
-          // register.
+        // register.
         assert((&Item == &Item2) || !sameVarOrReg(Target, Dest, Item2.Dest));
         if (sameVarOrReg(Target, Dest, Item2.Src))
           ++Item.NumPred;
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index dd70082..b71b0cb 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -55,13 +55,13 @@
   X(Reg_bl, 3, "bl", Reg_ebx, 0,1,0,0, 1,0,0,0,1, 0, 0,0,0,1,1,                \
     REGLIST2(RegX8632, ebx, bx))                                               \
   /* High 8-bit registers */                                                   \
-  X(Reg_ah, 4, "ah", Reg_eax, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_ah, 4, "ah", Reg_eax, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, eax, ax))                                               \
-  X(Reg_ch, 5, "ch", Reg_ecx, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_ch, 5, "ch", Reg_ecx, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, ecx, cx))                                               \
-  X(Reg_dh, 6, "dh", Reg_edx, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_dh, 6, "dh", Reg_edx, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, edx, dx))                                               \
-  X(Reg_bh, 7, "bh", Reg_ebx, 0,1,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_bh, 7, "bh", Reg_ebx, 0,1,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
     REGLIST2(RegX8632, ebx, bx))                                               \
   /* End of 8-bit register set */
 //#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,
@@ -212,22 +212,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8632_TABLE                                                     \
-  /* tag, element type, cvt , sdss, pack, width, fld */                        \
-  X(IceType_void,  IceType_void, "?",  "",   "",  "",  "")                     \
-  X(IceType_i1,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i8,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i16,   IceType_void, "si", "",   "",  "w", "")                     \
-  X(IceType_i32,   IceType_void, "si", "",   "",  "l", "")                     \
-  X(IceType_i64,   IceType_void, "si", "",   "",  "q", "")                     \
-  X(IceType_f32,   IceType_void, "ss", "ss", "d", "",  "s")                    \
-  X(IceType_f64,   IceType_void, "sd", "sd", "q", "",  "l")                    \
-  X(IceType_v4i1,  IceType_i32,  "?",  "",   "d", "",  "")                     \
-  X(IceType_v8i1,  IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v16i1, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v16i8, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v8i16, IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v4i32, IceType_i32,  "dq", "",   "d", "",  "")                     \
-  X(IceType_v4f32, IceType_f32,  "ps", "",   "d", "",  "")
+  /* tag,  element type, cvt , sdss, pack, width, fld */                       \
+  X(void,  void,         "?",  "",   "",   "",    "")                          \
+  X(i1,    void,         "si", "",   "",   "b",   "")                          \
+  X(i8,    void,         "si", "",   "",   "b",   "")                          \
+  X(i16,   void,         "si", "",   "",   "w",   "")                          \
+  X(i32,   void,         "si", "",   "",   "l",   "")                          \
+  X(i64,   void,         "si", "",   "",   "q",   "")                          \
+  X(f32,   void,         "ss", "ss", "d",  "",    "s")                         \
+  X(f64,   void,         "sd", "sd", "q",  "",    "l")                         \
+  X(v4i1,  i32,          "?",  "",   "d",  "",    "")                          \
+  X(v8i1,  i16,          "?",  "",   "w",  "",    "")                          \
+  X(v16i1, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v16i8, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v8i16, i16,          "?",  "",   "w",  "",    "")                          \
+  X(v4i32, i32,          "dq", "",   "d",  "",    "")                          \
+  X(v4f32, f32,          "ps", "",   "d",  "",    "")
 //#define X(tag, elementty, cvt, sdss, pack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index b3f452f..b3cdb40 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -292,22 +292,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8664_TABLE                                                     \
-  /* tag         , element type, cvt , sdss, pack, width, fld */               \
-  X(IceType_void,  IceType_void, "?",  "",   "",  "",  "")                     \
-  X(IceType_i1,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i8,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i16,   IceType_void, "si", "",   "",  "w", "")                     \
-  X(IceType_i32,   IceType_void, "si", "",   "",  "l", "")                     \
-  X(IceType_i64,   IceType_void, "si", "",   "",  "q", "")                     \
-  X(IceType_f32,   IceType_void, "ss", "ss", "d", "",  "s")                    \
-  X(IceType_f64,   IceType_void, "sd", "sd", "q", "",  "l")                    \
-  X(IceType_v4i1,  IceType_i32,  "?",  "",   "d", "",  "")                     \
-  X(IceType_v8i1,  IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v16i1, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v16i8, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v8i16, IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v4i32, IceType_i32,  "dq", "",   "d", "",  "")                     \
-  X(IceType_v4f32, IceType_f32,  "ps", "",   "d", "",  "")
+  /* tag,  element type, cvt , sdss, pack, width, fld */                       \
+  X(void,  void,         "?",  "",   "",   "",    "")                          \
+  X(i1,    void,         "si", "",   "",   "b",   "")                          \
+  X(i8,    void,         "si", "",   "",   "b",   "")                          \
+  X(i16,   void,         "si", "",   "",   "w",   "")                          \
+  X(i32,   void,         "si", "",   "",   "l",   "")                          \
+  X(i64,   void,         "si", "",   "",   "q",   "")                          \
+  X(f32,   void,         "ss", "ss", "d",  "",    "s")                         \
+  X(f64,   void,         "sd", "sd", "q",  "",    "l")                         \
+  X(v4i1,  i32,          "?",  "",   "d",  "",    "")                          \
+  X(v8i1,  i16,          "?",  "",   "w",  "",    "")                          \
+  X(v16i1, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v16i8, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v8i16, i16,          "?",  "",   "w",  "",    "")                          \
+  X(v4i32, i32,          "dq", "",   "d",  "",    "")                          \
+  X(v4f32, f32,          "ps", "",   "d",  "",    "")
 //#define X(tag, elementty, cvt, sdss, pack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8664_DEF
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 6428f53..e109bf2 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -1384,38 +1384,35 @@
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(this->getSrcSize() == 1);
   Operand *Src0 = this->getSrc(0);
-  assert(llvm::isa<Variable>(Src0));
+  int32_t DestReg = this->getDest()->getRegNum();
+  int32_t SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
   switch (Src0->getType()) {
   default:
     llvm_unreachable("unexpected source type!");
     break;
   case IceType_i8:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax ||
+           DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ah);
     Str << "\t"
         << "cbtw";
     break;
   case IceType_i16:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
     Str << "\t"
         << "cwtd";
     break;
   case IceType_i32:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Str << "\t"
         << "cltd";
     break;
   case IceType_i64:
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Str << "\t"
         << "cdto";
     break;
@@ -1428,35 +1425,32 @@
       Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
   assert(this->getSrcSize() == 1);
   Operand *Src0 = this->getSrc(0);
-  assert(llvm::isa<Variable>(Src0));
+  int32_t DestReg = this->getDest()->getRegNum();
+  int32_t SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
   switch (Src0->getType()) {
   default:
     llvm_unreachable("unexpected source type!");
     break;
   case IceType_i8:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax ||
+           DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ah);
     Asm->cbw();
     break;
   case IceType_i16:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
     Asm->cwd();
     break;
   case IceType_i32:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Asm->cdq();
     break;
   case IceType_i64:
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Asm->cqo();
     break;
   }
@@ -2278,32 +2272,29 @@
   } else {
     Str << "\tmov"
         << (!isScalarFloatingType(DestTy)
-                ? this->getWidthString(SrcTy)
+                ? this->getWidthString(DestTy)
                 : InstX86Base<Machine>::Traits::TypeAttributes[DestTy]
                       .SdSsString) << "\t";
   }
-  // For an integer truncation operation, src is wider than dest. Ideally, we
-  // use a mov instruction whose data width matches the narrower dest. This is
-  // a problem if e.g. src is a register like esi or si where there is no 8-bit
-  // version of the register. To be safe, we instead widen the dest to match
-  // src. This works even for stack-allocated dest variables because
-  // typeWidthOnStack() pads to a 4-byte boundary even if only a lower portion
-  // is used.
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
   // TODO: This assert disallows usages such as copying a floating
   // point value between a vector and a scalar (which movss is used for). Clean
   // this up.
   assert(Func->getTarget()->typeWidthInBytesOnStack(DestTy) ==
          Func->getTarget()->typeWidthInBytesOnStack(SrcTy));
-  Src->emit(Func);
+  const Operand *NewSrc = Src;
+  if (auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    int32_t NewRegNum = Variable::NoRegister;
+    if (SrcVar->hasReg())
+      NewRegNum = InstX86Base<Machine>::Traits::getGprForType(
+          DestTy, SrcVar->getRegNum());
+    if (SrcTy != DestTy)
+      NewSrc = SrcVar->asType(DestTy, NewRegNum);
+  }
+  NewSrc->emit(Func);
   Str << ", ";
-  int32_t NewRegNum = Variable::NoRegister;
-  if (this->getDest()->hasReg())
-    NewRegNum = InstX86Base<Machine>::Traits::getGprForType(
-        SrcTy, this->getDest()->getRegNum());
-  const Variable *NewDest = SrcTy == DestTy
-                                ? this->getDest()
-                                : this->getDest()->asType(SrcTy, NewRegNum);
-  NewDest->emit(Func);
+  this->getDest()->emit(Func);
 }
 
 template <class Machine>
@@ -2330,13 +2321,8 @@
       Machine>::Traits::Assembler::GPREmitterAddrOp GPRAddrEmitter = {
       &InstX86Base<Machine>::Traits::Assembler::mov,
       &InstX86Base<Machine>::Traits::Assembler::mov};
-  // For an integer truncation operation, src is wider than dest. Ideally, we
-  // use a mov instruction whose data width matches the narrower dest. This is
-  // a problem if e.g. src is a register like esi or si where there is no 8-bit
-  // version of the register. To be safe, we instead widen the dest to match
-  // src. This works even for stack-allocated dest variables because
-  // typeWidthOnStack() pads to a 4-byte boundary even if only a lower portion
-  // is used.
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
   // TODO: This assert disallows usages such as copying a floating
   // point value between a vector and a scalar (which movss is used for). Clean
   // this up.
@@ -2366,7 +2352,7 @@
         return;
       }
       if (isScalarIntegerType(SrcTy)) {
-        DestTy = SrcTy;
+        SrcTy = DestTy;
       }
       emitIASRegOpTyGPR<Machine>(Func, DestTy, Dest, Src, GPRRegEmitter);
       return;
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 0addc88..8ad78a5 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -428,6 +428,23 @@
 
 Ostream &operator<<(Ostream &Str, const LiveRange &L);
 
+/// RegClass indicates the physical register class that a Variable may be
+/// register-allocated from.  By default, a variable's register class is
+/// directly associated with its type.  However, the target lowering may define
+/// additional target-specific register classes by extending the set of enum
+/// values.
+enum RegClass : uint8_t {
+// Define RC_void, RC_i1, RC_i8, etc.
+#define X(tag, sizeLog2, align, elts, elty, str) RC_##tag = IceType_##tag,
+  ICETYPE_TABLE
+#undef X
+      RC_Target,
+  // Leave plenty of space for target-specific values.
+  RC_Max = std::numeric_limits<uint8_t>::max()
+};
+static_assert(RC_Target == static_cast<RegClass>(IceType_NUM),
+              "Expected RC_Target and IceType_NUM to be the same");
+
 /// Variable represents an operand that is register-allocated or
 /// stack-allocated. If it is register-allocated, it will ultimately have a
 /// non-negative RegNum field.
@@ -493,6 +510,9 @@
     return RegRequirement == RR_MustNotHaveRegister;
   }
 
+  void setRegClass(uint8_t RC) { RegisterClass = static_cast<RegClass>(RC); }
+  RegClass getRegClass() const { return RegisterClass; }
+
   LiveRange &getLiveRange() { return Live; }
   const LiveRange &getLiveRange() const { return Live; }
   void setLiveRange(const LiveRange &Range) { Live = Range; }
@@ -537,7 +557,8 @@
 
 protected:
   Variable(OperandKind K, Type Ty, SizeT Index)
-      : Operand(K, Ty), Number(Index) {
+      : Operand(K, Ty), Number(Index),
+        RegisterClass(static_cast<RegClass>(Ty)) {
     Vars = VarsReal;
     Vars[0] = this;
     NumVars = 1;
@@ -553,6 +574,7 @@
   /// pointer and other physical registers specifically referenced by name.
   bool IgnoreLiveness = false;
   RegRequirement RegRequirement = RR_MayHaveRegister;
+  RegClass RegisterClass;
   /// RegNum is the allocated register, or NoRegister if it isn't
   /// register-allocated.
   int32_t RegNum = NoRegister;
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 6b8656b..e6f3f56 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -833,8 +833,7 @@
     Iter.Cur = Unhandled.back();
     Unhandled.pop_back();
     dumpLiveRangeTrace("\nConsidering  ", Iter.Cur);
-    Iter.RegMask =
-        RegMaskFull & Target->getRegisterSetForType(Iter.Cur->getType());
+    Iter.RegMask = RegMaskFull & Target->getRegistersForVariable(Iter.Cur);
     KillsRange.trim(Iter.Cur->getLiveRange().getStart());
 
     // Check for pre-colored ranges. If Cur is pre-colored, it definitely gets
@@ -862,11 +861,10 @@
     // Disable AllowOverlap if an Active variable, which is not Prefer, shares
     // Prefer's register, and has a definition within Cur's live range.
     if (Iter.AllowOverlap) {
+      const llvm::SmallBitVector &Aliases = *RegAliases[Iter.PreferReg];
       for (const Variable *Item : Active) {
         int32_t RegNum = Item->getRegNumTmp();
-        // TODO(stichnot): Consider aliases of RegNum.  This is probably a
-        // correctness issue.
-        if (Item != Iter.Prefer && RegNum == Iter.PreferReg &&
+        if (Item != Iter.Prefer && Aliases[RegNum] &&
             overlapsDefs(Func, Iter.Cur, Item)) {
           Iter.AllowOverlap = false;
           dumpDisableOverlap(Func, Item, "Active");
diff --git a/src/IceRegistersARM32.h b/src/IceRegistersARM32.h
index a32ee4a..64dcf0d 100644
--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -17,6 +17,7 @@
 
 #include "IceDefs.h"
 #include "IceInstARM32.def"
+#include "IceOperand.h" // RC_Target
 #include "IceTypes.h"
 
 namespace Ice {
@@ -118,6 +119,9 @@
   static const char *RegNames[];
 };
 
+// Extend enum RegClass with ARM32-specific register classes (if any).
+enum RegClassARM32 : uint8_t { RCARM32_NUM = RC_Target };
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEREGISTERSARM32_H
diff --git a/src/IceRegistersMIPS32.h b/src/IceRegistersMIPS32.h
index df13f0e..38335f8 100644
--- a/src/IceRegistersMIPS32.h
+++ b/src/IceRegistersMIPS32.h
@@ -17,6 +17,7 @@
 
 #include "IceDefs.h"
 #include "IceInstMIPS32.def"
+#include "IceOperand.h" // RC_Target
 #include "IceTypes.h"
 
 namespace Ice {
@@ -59,6 +60,9 @@
 
 } // end of namespace RegMIPS32
 
+// Extend enum RegClass with MIPS32-specific register classes (if any).
+enum RegClassMIPS32 : uint8_t { RCMIPS32_NUM = RC_Target };
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEREGISTERSMIPS32_H
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index f518225..aff423c 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -234,7 +234,8 @@
 
   virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                               RegSetMask Exclude) const = 0;
-  virtual const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const = 0;
+  virtual const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const = 0;
   virtual const llvm::SmallBitVector &getAliasesForRegister(SizeT) const = 0;
 
   void regAlloc(RegAllocKind Kind);
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 62a903d..f6029c4 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -74,8 +74,11 @@
   IceString getRegName(SizeT RegNum, Type Ty) const override;
   llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                       RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSet[RC];
   }
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
     return RegisterAliases[Reg];
@@ -554,7 +557,7 @@
   bool MaybeLeafFunc = true;
   size_t SpillAreaSizeBytes = 0;
   // TODO(jpp): std::array instead of array.
-  static llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
+  static llvm::SmallBitVector TypeToRegisterSet[RCARM32_NUM];
   static llvm::SmallBitVector RegisterAliases[RegARM32::Reg_NUM];
   static llvm::SmallBitVector ScratchRegs;
   llvm::SmallBitVector RegsUsed;
diff --git a/src/IceTargetLoweringMIPS32.h b/src/IceTargetLoweringMIPS32.h
index 3cd1687..c01c6c2 100644
--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -42,8 +42,11 @@
   IceString getRegName(SizeT RegNum, Type Ty) const override;
   llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                       RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSet[RC];
   }
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
     return RegisterAliases[Reg];
@@ -231,7 +234,7 @@
 
   bool UsesFramePointer = false;
   bool NeedsStackAlignment = false;
-  static llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
+  static llvm::SmallBitVector TypeToRegisterSet[RCMIPS32_NUM];
   static llvm::SmallBitVector RegisterAliases[RegMIPS32::Reg_NUM];
   static llvm::SmallBitVector ScratchRegs;
   llvm::SmallBitVector RegsUsed;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 6918cdf..2ecd77d 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -74,7 +74,7 @@
 const MachineTraits<TargetX8632>::TableTypeX8632AttributesType
     MachineTraits<TargetX8632>::TableTypeX8632Attributes[] = {
 #define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
-  { elementty }                                                                \
+  { IceType_##elementty }                                                      \
   ,
         ICETYPEX8632_TABLE
 #undef X
@@ -87,7 +87,7 @@
 const char *MachineTraits<TargetX8632>::TargetName = "X8632";
 
 template <>
-std::array<llvm::SmallBitVector, IceType_NUM>
+std::array<llvm::SmallBitVector, RCX86_NUM>
     TargetX86Base<TargetX8632>::TypeToRegisterSet = {};
 
 template <>
@@ -957,7 +957,7 @@
 };
 // Define a set of constants based on high-level table entries.
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  static const int _table1_##tag = tag;
+  static const int _table1_##tag = IceType_##tag;
 ICETYPE_TABLE
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index 1c92bed..cd26702 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -22,8 +22,9 @@
 #include "IceInstX8632.def"
 #include "IceOperand.h"
 #include "IceRegistersX8632.h"
-#include "IceTargetLoweringX8632.def"
 #include "IceTargetLowering.h"
+#include "IceTargetLoweringX8632.def"
+#include "IceTargetLoweringX86RegClass.h"
 
 #include <array>
 
@@ -398,7 +399,7 @@
   }
 
   static void initRegisterSet(
-      std::array<llvm::SmallBitVector, IceType_NUM> *TypeToRegisterSet,
+      std::array<llvm::SmallBitVector, RCX86_NUM> *TypeToRegisterSet,
       std::array<llvm::SmallBitVector, RegisterSet::Reg_NUM> *RegisterAliases,
       llvm::SmallBitVector *ScratchRegs) {
     llvm::SmallBitVector IntegerRegistersI32(RegisterSet::Reg_NUM);
@@ -406,6 +407,11 @@
     llvm::SmallBitVector IntegerRegistersI8(RegisterSet::Reg_NUM);
     llvm::SmallBitVector FloatRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector VectorRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc64To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc32To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc16To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc8RcvrRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector AhRcvrRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector InvalidRegisters(RegisterSet::Reg_NUM);
     ScratchRegs->resize(RegisterSet::Reg_NUM);
 #define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,     \
@@ -416,6 +422,11 @@
   (IntegerRegistersI8)[RegisterSet::val] = is8;                                \
   (FloatRegisters)[RegisterSet::val] = isXmm;                                  \
   (VectorRegisters)[RegisterSet::val] = isXmm;                                 \
+  (Trunc64To8Registers)[RegisterSet::val] = is64To8;                           \
+  (Trunc32To8Registers)[RegisterSet::val] = is32To8;                           \
+  (Trunc16To8Registers)[RegisterSet::val] = is16To8;                           \
+  (Trunc8RcvrRegisters)[RegisterSet::val] = isTrunc8Rcvr;                      \
+  (AhRcvrRegisters)[RegisterSet::val] = isAhRcvr;                              \
   (*RegisterAliases)[RegisterSet::val].resize(RegisterSet::Reg_NUM);           \
   for (SizeT RegAlias : aliases) {                                             \
     assert(!(*RegisterAliases)[RegisterSet::val][RegAlias] &&                  \
@@ -427,21 +438,26 @@
     REGX8632_TABLE;
 #undef X
 
-    (*TypeToRegisterSet)[IceType_void] = InvalidRegisters;
-    (*TypeToRegisterSet)[IceType_i1] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i8] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i16] = IntegerRegistersI16;
-    (*TypeToRegisterSet)[IceType_i32] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_i64] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_f32] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_f64] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_v4i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i8] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i16] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4i32] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_void] = InvalidRegisters;
+    (*TypeToRegisterSet)[RC_i1] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i8] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i16] = IntegerRegistersI16;
+    (*TypeToRegisterSet)[RC_i32] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_i64] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_f32] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_f64] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_v4i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i8] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i16] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4i32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RCX86_Is64To8] = Trunc64To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is32To8] = Trunc32To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is16To8] = Trunc16To8Registers;
+    (*TypeToRegisterSet)[RCX86_IsTrunc8Rcvr] = Trunc8RcvrRegisters;
+    (*TypeToRegisterSet)[RCX86_IsAhRcvr] = AhRcvrRegisters;
   }
 
   static llvm::SmallBitVector
@@ -512,7 +528,12 @@
     Index |= (is8 << (AttrKey++));                                             \
     Index |= (is16 << (AttrKey++));                                            \
     Index |= (is32 << (AttrKey++));                                            \
+    Index |= (is64 << (AttrKey++));                                            \
     Index |= (isXmm << (AttrKey++));                                           \
+    Index |= (is16To8 << (AttrKey++));                                         \
+    Index |= (is32To8 << (AttrKey++));                                         \
+    Index |= (is64To8 << (AttrKey++));                                         \
+    Index |= (isTrunc8Rcvr << (AttrKey++));                                    \
     /* val is assigned to an equivalence class based on its properties. */     \
     EquivalenceClasses[Index].push_back(RegisterSet::val);                     \
   }
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 1783b26..bdebfbe 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -74,7 +74,7 @@
 const MachineTraits<TargetX8664>::TableTypeX8664AttributesType
     MachineTraits<TargetX8664>::TableTypeX8664Attributes[] = {
 #define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
-  { elementty }                                                                \
+  { IceType_##elementty }                                                      \
   ,
         ICETYPEX8664_TABLE
 #undef X
@@ -87,7 +87,7 @@
 const char *MachineTraits<TargetX8664>::TargetName = "X8664";
 
 template <>
-std::array<llvm::SmallBitVector, IceType_NUM>
+std::array<llvm::SmallBitVector, RCX86_NUM>
     TargetX86Base<TargetX8664>::TypeToRegisterSet = {};
 
 template <>
@@ -955,7 +955,7 @@
 };
 // Define a set of constants based on high-level table entries.
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  static const int _table1_##tag = tag;
+  static const int _table1_##tag = IceType_##tag;
 ICETYPE_TABLE
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 331f076..f6c834a 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -24,6 +24,7 @@
 #include "IceRegistersX8664.h"
 #include "IceTargetLowering.h"
 #include "IceTargetLoweringX8664.def"
+#include "IceTargetLoweringX86RegClass.h"
 
 #include <array>
 
@@ -379,7 +380,7 @@
   static int32_t getGprForType(Type, int32_t RegNum) { return RegNum; }
 
   static void initRegisterSet(
-      std::array<llvm::SmallBitVector, IceType_NUM> *TypeToRegisterSet,
+      std::array<llvm::SmallBitVector, RCX86_NUM> *TypeToRegisterSet,
       std::array<llvm::SmallBitVector, RegisterSet::Reg_NUM> *RegisterAliases,
       llvm::SmallBitVector *ScratchRegs) {
     llvm::SmallBitVector IntegerRegistersI64(RegisterSet::Reg_NUM);
@@ -388,6 +389,11 @@
     llvm::SmallBitVector IntegerRegistersI8(RegisterSet::Reg_NUM);
     llvm::SmallBitVector FloatRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector VectorRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc64To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc32To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc16To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc8RcvrRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector AhRcvrRegisters(RegisterSet::Reg_NUM);
     llvm::SmallBitVector InvalidRegisters(RegisterSet::Reg_NUM);
     ScratchRegs->resize(RegisterSet::Reg_NUM);
 
@@ -400,6 +406,11 @@
   (IntegerRegistersI8)[RegisterSet::val] = is8;                                \
   (FloatRegisters)[RegisterSet::val] = isXmm;                                  \
   (VectorRegisters)[RegisterSet::val] = isXmm;                                 \
+  (Trunc64To8Registers)[RegisterSet::val] = is64To8;                           \
+  (Trunc32To8Registers)[RegisterSet::val] = is32To8;                           \
+  (Trunc16To8Registers)[RegisterSet::val] = is16To8;                           \
+  (Trunc8RcvrRegisters)[RegisterSet::val] = isTrunc8Rcvr;                      \
+  (AhRcvrRegisters)[RegisterSet::val] = isAhRcvr;                              \
   (*RegisterAliases)[RegisterSet::val].resize(RegisterSet::Reg_NUM);           \
   for (SizeT RegAlias : aliases) {                                             \
     assert(!(*RegisterAliases)[RegisterSet::val][RegAlias] &&                  \
@@ -411,21 +422,26 @@
     REGX8664_TABLE;
 #undef X
 
-    (*TypeToRegisterSet)[IceType_void] = InvalidRegisters;
-    (*TypeToRegisterSet)[IceType_i1] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i8] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i16] = IntegerRegistersI16;
-    (*TypeToRegisterSet)[IceType_i32] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_i64] = IntegerRegistersI64;
-    (*TypeToRegisterSet)[IceType_f32] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_f64] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_v4i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i8] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i16] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4i32] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_void] = InvalidRegisters;
+    (*TypeToRegisterSet)[RC_i1] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i8] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i16] = IntegerRegistersI16;
+    (*TypeToRegisterSet)[RC_i32] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_i64] = IntegerRegistersI64;
+    (*TypeToRegisterSet)[RC_f32] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_f64] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_v4i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i8] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i16] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4i32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RCX86_Is64To8] = Trunc64To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is32To8] = Trunc32To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is16To8] = Trunc16To8Registers;
+    (*TypeToRegisterSet)[RCX86_IsTrunc8Rcvr] = Trunc8RcvrRegisters;
+    (*TypeToRegisterSet)[RCX86_IsAhRcvr] = AhRcvrRegisters;
   }
 
   static llvm::SmallBitVector
@@ -498,6 +514,10 @@
     Index |= (is32 << (AttrKey++));                                            \
     Index |= (is64 << (AttrKey++));                                            \
     Index |= (isXmm << (AttrKey++));                                           \
+    Index |= (is16To8 << (AttrKey++));                                         \
+    Index |= (is32To8 << (AttrKey++));                                         \
+    Index |= (is64To8 << (AttrKey++));                                         \
+    Index |= (isTrunc8Rcvr << (AttrKey++));                                    \
     /* val is assigned to an equivalence class based on its properties. */     \
     EquivalenceClasses[Index].push_back(RegisterSet::val);                     \
   }
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 1b52aad..bd09b5d 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -20,6 +20,7 @@
 #include "IceInst.h"
 #include "IceSwitchLowering.h"
 #include "IceTargetLowering.h"
+#include "IceTargetLoweringX86RegClass.h"
 #include "IceUtils.h"
 
 #include <array>
@@ -73,8 +74,11 @@
   IceString getRegName(SizeT RegNum, Type Ty) const override;
   llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                       RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass(); // generic RC_* or x86 RCX86_* class
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM); // guards the index below
+    return TypeToRegisterSet[RC];
   }
 
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
@@ -263,6 +267,7 @@
   static Type firstTypeThatFitsSize(uint32_t Size,
                                     uint32_t MaxSize = NoSizeLimit);
 
+  Variable *copyToReg8(Operand *Src, int32_t RegNum = Variable::NoRegister);
   Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
 
   /// \name Returns a vector in a register with the given constant entries.
@@ -674,7 +679,7 @@
   bool NeedsStackAlignment = false;
   size_t SpillAreaSizeBytes = 0;
   size_t FixedAllocaSizeBytes = 0;
-  static std::array<llvm::SmallBitVector, IceType_NUM> TypeToRegisterSet;
+  static std::array<llvm::SmallBitVector, RCX86_NUM> TypeToRegisterSet;
   static std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
       RegisterAliases;
   static llvm::SmallBitVector ScratchRegs;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index b22ec64..56ee04d 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1216,8 +1216,7 @@
     //   t1:ecx = c.lo & 0xff
     //   t2 = b.lo
     //   t3 = b.hi
-    T_1 = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-    _mov(T_1, Src1Lo);
+    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
     _mov(T_2, Src0Lo);
     _mov(T_3, Src0Hi);
     switch (Op) {
@@ -1295,6 +1294,7 @@
 template <class Machine>
 void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
   Variable *Dest = Inst->getDest();
+  Type Ty = Dest->getType();
   Operand *Src0 = legalize(Inst->getSrc(0));
   Operand *Src1 = legalize(Inst->getSrc(1));
   if (Inst->isCommutative()) {
@@ -1316,7 +1316,7 @@
     assert(SwapCount <= 1);
     (void)SwapCount;
   }
-  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     // These x86-32 helper-call-involved instructions are lowered in this
     // separate switch. This is because loOperand() and hiOperand() may insert
     // redundant instructions for constant blinding and pooling. Such redundant
@@ -1463,7 +1463,7 @@
     }
     return;
   }
-  if (isVectorType(Dest->getType())) {
+  if (isVectorType(Ty)) {
     // TODO: Trap on integer divide and integer modulo by zero. See:
     // https://code.google.com/p/nativeclient/issues/detail?id=3899
     if (llvm::isa<typename Traits::X86OperandMem>(Src1))
@@ -1473,46 +1473,45 @@
       llvm_unreachable("Unknown arithmetic operator");
       break;
     case InstArithmetic::Add: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _padd(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::And: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _pand(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Or: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _por(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Xor: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _pxor(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Sub: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _psub(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Mul: {
-      bool TypesAreValidForPmull =
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
+      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
       bool InstructionSetIsValidForPmull =
-          Dest->getType() == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
+          Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
-        Variable *T = makeReg(Dest->getType());
+        Variable *T = makeReg(Ty);
         _movp(T, Src0);
         _pmull(T, Src0 == Src1 ? T : Src1);
         _movp(Dest, T);
-      } else if (Dest->getType() == IceType_v4i32) {
+      } else if (Ty == IceType_v4i32) {
         // Lowering sequence:
         // Note: The mask arguments have index 0 on the left.
         //
@@ -1550,7 +1549,7 @@
         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
         _movp(Dest, T4);
-      } else if (Dest->getType() == IceType_v16i8) {
+      } else if (Ty == IceType_v16i8) {
         scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       } else {
         llvm::report_fatal_error("Invalid vector multiply type");
@@ -1566,25 +1565,25 @@
       scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       break;
     case InstArithmetic::Fadd: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _addps(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Fsub: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _subps(T, Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Fmul: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _mulps(T, Src0 == Src1 ? T : Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Fdiv: {
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(Ty);
       _movp(T, Src0);
       _divps(T, Src1);
       _movp(Dest, T);
@@ -1633,13 +1632,13 @@
     }
     // The 8-bit version of imul only allows the form "imul r/m8" where T must
     // be in al.
-    if (isByteSizedArithType(Dest->getType())) {
+    if (isByteSizedArithType(Ty)) {
       _mov(T, Src0, Traits::RegisterSet::Reg_al);
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
       _imul(T, Src0 == Src1 ? T : Src1);
       _mov(Dest, T);
     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-      T = makeReg(Dest->getType());
+      T = makeReg(Ty);
       _imul_imm(T, Src0, ImmConst);
       _mov(Dest, T);
     } else {
@@ -1650,76 +1649,51 @@
     break;
   case InstArithmetic::Shl:
     _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1)) {
-      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-      _mov(Cl, Src1);
-      Src1 = Cl;
-    }
+    if (!llvm::isa<ConstantInteger32>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
     _shl(T, Src1);
     _mov(Dest, T);
     break;
   case InstArithmetic::Lshr:
     _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1)) {
-      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-      _mov(Cl, Src1);
-      Src1 = Cl;
-    }
+    if (!llvm::isa<ConstantInteger32>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
     _shr(T, Src1);
     _mov(Dest, T);
     break;
   case InstArithmetic::Ashr:
     _mov(T, Src0);
-    if (!llvm::isa<ConstantInteger32>(Src1)) {
-      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
-      _mov(Cl, Src1);
-      Src1 = Cl;
-    }
+    if (!llvm::isa<ConstantInteger32>(Src1))
+      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
     _sar(T, Src1);
     _mov(Dest, T);
     break;
-  case InstArithmetic::Udiv:
+  case InstArithmetic::Udiv: {
     // div and idiv are the few arithmetic operators that do not allow
     // immediates as the operand.
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      // For 8-bit unsigned division we need to zero-extend al into ah. A mov
-      // $0, %ah (or xor %ah, %ah) would work just fine, except that the x86-64
-      // assembler refuses to encode %ah (encoding %spl with a REX prefix
-      // instead.) Accessing %ah in 64-bit is "tricky" as you can't encode %ah
-      // with any other 8-bit register except for %a[lh], %b[lh], %c[lh], and
-      // d[%lh], which means the X86 target lowering (and the register
-      // allocator) would have to be aware of this restriction. For now, we
-      // simply zero %eax completely, and move the dividend into %al.
-      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      Context.insert(InstFakeDef::create(Func, T_eax));
-      _xor(T_eax, T_eax);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      _div(T, Src1, T);
-      _mov(Dest, T);
-      Context.insert(InstFakeUse::create(Func, T_eax));
-    } else {
-      Type Ty = Dest->getType();
-      uint32_t Eax = Traits::RegisterSet::Reg_eax;
-      uint32_t Edx = Traits::RegisterSet::Reg_edx;
-      switch (Ty) {
-      default:
-        llvm_unreachable("Bad type for udiv");
-      // fallthrough
-      case IceType_i32:
-        break;
-      case IceType_i16:
-        Eax = Traits::RegisterSet::Reg_ax;
-        Edx = Traits::RegisterSet::Reg_dx;
-        break;
-      }
-      Constant *Zero = Ctx->getConstantZero(Ty);
-      _mov(T, Src0, Eax);
-      _mov(T_edx, Zero, Edx);
-      _div(T, Src1, T_edx);
-      _mov(Dest, T);
+    uint32_t Eax = Traits::RegisterSet::Reg_eax;
+    uint32_t Edx = Traits::RegisterSet::Reg_edx;
+    switch (Ty) {
+    default:
+      llvm_unreachable("Bad type for udiv");
+    // fallthrough
+    case IceType_i32:
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
     }
-    break;
+    _mov(T, Src0, Eax);
+    _mov(T_edx, Ctx->getConstantZero(Ty), Edx);
+    _div(T, Src1, T_edx);
+    _mov(Dest, T);
+  } break;
   case InstArithmetic::Sdiv:
     // TODO(stichnot): Enable this after doing better performance and cross
     // testing.
@@ -1731,7 +1705,6 @@
         uint32_t UDivisor = static_cast<uint32_t>(Divisor);
         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
           uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          Type Ty = Dest->getType();
           // LLVM does the following for dest=src/(1<<log):
           //   t=src
           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
@@ -1757,7 +1730,7 @@
       }
     }
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    switch (Type Ty = Dest->getType()) {
+    switch (Ty) {
     default:
       llvm_unreachable("Bad type for sdiv");
     // fallthrough
@@ -1778,47 +1751,32 @@
     _idiv(T, Src1, T_edx);
     _mov(Dest, T);
     break;
-  case InstArithmetic::Urem:
+  case InstArithmetic::Urem: {
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      Context.insert(InstFakeDef::create(Func, T_eax));
-      _xor(T_eax, T_eax);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      _div(T, Src1, T);
-      // shr $8, %eax shifts ah (i.e., the 8 bit remainder) into al. We don't
-      // mov %ah, %al because it would make x86-64 codegen more complicated. If
-      // this ever becomes a problem we can introduce a pseudo rem instruction
-      // that returns the remainder in %al directly (and uses a mov for copying
-      // %ah to %al.)
-      static constexpr uint8_t AlSizeInBits = 8;
-      _shr(T_eax, Ctx->getConstantInt8(AlSizeInBits));
-      _mov(Dest, T);
-      Context.insert(InstFakeUse::create(Func, T_eax));
-    } else {
-      Type Ty = Dest->getType();
-      uint32_t Eax = Traits::RegisterSet::Reg_eax;
-      uint32_t Edx = Traits::RegisterSet::Reg_edx;
-      switch (Ty) {
-      default:
-        llvm_unreachable("Bad type for urem");
-      // fallthrough
-      case IceType_i32:
-        break;
-      case IceType_i16:
-        Eax = Traits::RegisterSet::Reg_ax;
-        Edx = Traits::RegisterSet::Reg_dx;
-        break;
-      }
-      Constant *Zero = Ctx->getConstantZero(Ty);
-      T_edx = makeReg(Dest->getType(), Edx);
-      _mov(T_edx, Zero);
-      _mov(T, Src0, Eax);
-      _div(T_edx, Src1, T);
-      _mov(Dest, T_edx);
+    uint32_t Eax = Traits::RegisterSet::Reg_eax;
+    uint32_t Edx = Traits::RegisterSet::Reg_edx;
+    switch (Ty) {
+    default:
+      llvm_unreachable("Bad type for urem");
+    // fallthrough
+    case IceType_i32:
+      break;
+    case IceType_i16:
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
+      break;
+    case IceType_i8:
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
+      break;
     }
-    break;
-  case InstArithmetic::Srem:
+    T_edx = makeReg(Ty, Edx);
+    _mov(T_edx, Ctx->getConstantZero(Ty));
+    _mov(T, Src0, Eax);
+    _div(T_edx, Src1, T);
+    _mov(Dest, T_edx);
+  } break;
+  case InstArithmetic::Srem: {
     // TODO(stichnot): Enable this after doing better performance and cross
     // testing.
     if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
@@ -1829,7 +1787,6 @@
         uint32_t UDivisor = static_cast<uint32_t>(Divisor);
         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
           uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          Type Ty = Dest->getType();
           // LLVM does the following for dest=src%(1<<log):
           //   t=src
           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
@@ -1860,37 +1817,29 @@
       }
     }
     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    switch (Type Ty = Dest->getType()) {
+    uint32_t Eax = Traits::RegisterSet::Reg_eax;
+    uint32_t Edx = Traits::RegisterSet::Reg_edx;
+    switch (Ty) {
     default:
       llvm_unreachable("Bad type for srem");
     // fallthrough
     case IceType_i32:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
-      _cbwdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
       break;
     case IceType_i16:
-      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
-      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
-      _cbwdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
+      Eax = Traits::RegisterSet::Reg_ax;
+      Edx = Traits::RegisterSet::Reg_dx;
       break;
     case IceType_i8:
-      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
-      // TODO(stichnot): Use register ah for T_edx, and remove the _shr().
-      // T_edx = makeReg(Ty, Traits::RegisterSet::Reg_ah);
-      _mov(T, Src0, Traits::RegisterSet::Reg_al);
-      _cbwdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      static constexpr uint8_t AlSizeInBits = 8;
-      _shr(T_edx, Ctx->getConstantInt8(AlSizeInBits));
-      _mov(Dest, T_edx);
+      Eax = Traits::RegisterSet::Reg_al;
+      Edx = Traits::RegisterSet::Reg_ah;
       break;
     }
-    break;
+    T_edx = makeReg(Ty, Edx);
+    _mov(T, Src0, Eax);
+    _cbwdq(T_edx, T);
+    _idiv(T_edx, Src1, T);
+    _mov(Dest, T_edx);
+  } break;
   case InstArithmetic::Fadd:
     _mov(T, Src0);
     _addss(T, Src1);
@@ -1913,7 +1862,6 @@
     break;
   case InstArithmetic::Frem: {
     constexpr SizeT MaxSrcs = 2;
-    Type Ty = Dest->getType();
     InstCall *Call = makeHelperCall(
         isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
     Call->addArg(Src0);
@@ -1991,6 +1939,7 @@
   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
   InstCast::OpKind CastKind = Inst->getCastKind();
   Variable *Dest = Inst->getDest();
+  Type DestTy = Dest->getType();
   switch (CastKind) {
   default:
     Func->setError("Cast type not supported");
@@ -2003,15 +1952,14 @@
     // we're unlikely to see something like that in the bitcode that the
     // optimizer wouldn't have already taken care of.
     Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(Dest->getType())) {
-      Type DestTy = Dest->getType();
+    if (isVectorType(DestTy)) {
       if (DestTy == IceType_v16i8) {
         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
-        Variable *OneMask = makeVectorOfOnes(Dest->getType());
+        Variable *OneMask = makeVectorOfOnes(DestTy);
         Variable *T = makeReg(DestTy);
         _movp(T, Src0RM);
         _pand(T, OneMask);
-        Variable *Zeros = makeVectorOfZeros(Dest->getType());
+        Variable *Zeros = makeVectorOfZeros(DestTy);
         _pcmpgt(T, Zeros);
         _movp(Dest, T);
       } else {
@@ -2026,7 +1974,7 @@
         _psra(T, ShiftConstant);
         _movp(Dest, T);
       }
-    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
       Constant *Shift = Ctx->getConstantInt32(31);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -2053,12 +2001,10 @@
       // shl t1, dst_bitwidth - 1
       // sar t1, dst_bitwidth - 1
       // dst = t1
-      size_t DestBits =
-          Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType());
+      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
-      Variable *T = makeReg(Dest->getType());
-      if (typeWidthInBytes(Dest->getType()) <=
-          typeWidthInBytes(Src0RM->getType())) {
+      Variable *T = makeReg(DestTy);
+      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
         _mov(T, Src0RM);
       } else {
         // Widen the source using movsx or movzx. (It doesn't matter which one,
@@ -2070,7 +2016,7 @@
       _mov(Dest, T);
     } else {
       // t1 = movsx src; dst = t1
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _movsx(T, Src0RM);
       _mov(Dest, T);
     }
@@ -2078,15 +2024,14 @@
   }
   case InstCast::Zext: {
     Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(Dest->getType())) {
+    if (isVectorType(DestTy)) {
       // onemask = materialize(1,1,...); dest = onemask & src
-      Type DestTy = Dest->getType();
       Variable *OneMask = makeVectorOfOnes(DestTy);
       Variable *T = makeReg(DestTy);
       _movp(T, Src0RM);
       _pand(T, OneMask);
       _movp(Dest, T);
-    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
       // t1=movzx src; dst.lo=t1; dst.hi=0
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -2101,7 +2046,6 @@
       _mov(DestHi, Zero);
     } else if (Src0RM->getType() == IceType_i1) {
       // t = Src0RM; Dest = t
-      Type DestTy = Dest->getType();
       Variable *T = nullptr;
       if (DestTy == IceType_i8) {
         _mov(T, Src0RM);
@@ -2117,32 +2061,40 @@
       _mov(Dest, T);
     } else {
       // t1 = movzx src; dst = t1
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _movzx(T, Src0RM);
       _mov(Dest, T);
     }
     break;
   }
   case InstCast::Trunc: {
-    if (isVectorType(Dest->getType())) {
+    if (isVectorType(DestTy)) {
       // onemask = materialize(1,1,...); dst = src & onemask
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       Type Src0Ty = Src0RM->getType();
       Variable *OneMask = makeVectorOfOnes(Src0Ty);
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _movp(T, Src0RM);
       _pand(T, OneMask);
       _movp(Dest, T);
+    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
+      // Make sure we truncate from and into valid registers.
+      Operand *Src0 = legalizeUndef(Inst->getSrc(0));
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      Variable *T = copyToReg8(Src0RM);
+      if (DestTy == IceType_i1)
+        _and(T, Ctx->getConstantInt1(1));
+      _mov(Dest, T);
     } else {
       Operand *Src0 = legalizeUndef(Inst->getSrc(0));
       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
         Src0 = loOperand(Src0);
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       // t1 = trunc Src0RM; Dest = t1
-      Variable *T = nullptr;
+      Variable *T = makeReg(DestTy);
       _mov(T, Src0RM);
-      if (Dest->getType() == IceType_i1)
-        _and(T, Ctx->getConstantInt1(1));
       _mov(Dest, T);
     }
     break;
@@ -2151,22 +2103,22 @@
   case InstCast::Fpext: {
     Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
     // t1 = cvt Src0RM; Dest = t1
-    Variable *T = makeReg(Dest->getType());
+    Variable *T = makeReg(DestTy);
     _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
     _mov(Dest, T);
     break;
   }
   case InstCast::Fptosi:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4i32 &&
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32 &&
              Inst->getSrc(0)->getType() == IceType_v4f32);
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       if (llvm::isa<typename Traits::X86OperandMem>(Src0RM))
         Src0RM = legalizeToReg(Src0RM);
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq);
       _movp(Dest, T);
-    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
       constexpr SizeT MaxSrcs = 1;
       Type SrcType = Inst->getSrc(0)->getType();
       InstCall *Call =
@@ -2179,40 +2131,44 @@
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
       Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Dest->getType() == IceType_i64) {
+      if (Traits::Is64Bit && DestTy == IceType_i64) {
         T_1 = makeReg(IceType_i64);
       } else {
-        assert(Dest->getType() != IceType_i64);
+        assert(DestTy != IceType_i64);
         T_1 = makeReg(IceType_i32);
       }
       // cvt() requires its integer argument to be a GPR.
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (Dest->getType() == IceType_i1)
+      if (DestTy == IceType_i1)
         _and(T_2, Ctx->getConstantInt1(1));
       _mov(Dest, T_2);
     }
     break;
   case InstCast::Fptoui:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4i32 &&
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4i32 &&
              Inst->getSrc(0)->getType() == IceType_v4f32);
       constexpr SizeT MaxSrcs = 1;
       InstCall *Call = makeHelperCall(H_fptoui_4xi32_f32, Dest, MaxSrcs);
       Call->addArg(Inst->getSrc(0));
       lowerCall(Call);
-    } else if (Dest->getType() == IceType_i64 ||
-               (!Traits::Is64Bit && Dest->getType() == IceType_i32)) {
+    } else if (DestTy == IceType_i64 ||
+               (!Traits::Is64Bit && DestTy == IceType_i32)) {
       // Use a helper for both x86-32 and x86-64.
       constexpr SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
       Type SrcType = Inst->getSrc(0)->getType();
       IceString TargetString;
       if (Traits::Is64Bit) {
         TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
                                                          : H_fptoui_f64_i64;
-      } else if (isInt32Asserting32Or64(DestType)) {
+      } else if (isInt32Asserting32Or64(DestTy)) {
         TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
                                                          : H_fptoui_f64_i32;
       } else {
@@ -2226,39 +2182,43 @@
     } else {
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      assert(Dest->getType() != IceType_i64);
+      assert(DestTy != IceType_i64);
       Variable *T_1 = nullptr;
-      if (Traits::Is64Bit && Dest->getType() == IceType_i32) {
+      if (Traits::Is64Bit && DestTy == IceType_i32) {
         T_1 = makeReg(IceType_i64);
       } else {
-        assert(Dest->getType() != IceType_i32);
+        assert(DestTy != IceType_i32);
         T_1 = makeReg(IceType_i32);
       }
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
+      if (isByteSizedType(DestTy)) {
+        assert(T_1->getType() == IceType_i32);
+        T_1->setRegClass(RCX86_Is32To8);
+        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
+      }
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (Dest->getType() == IceType_i1)
+      if (DestTy == IceType_i1)
         _and(T_2, Ctx->getConstantInt1(1));
       _mov(Dest, T_2);
     }
     break;
   case InstCast::Sitofp:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4f32 &&
+    if (isVectorType(DestTy)) {
+      assert(DestTy == IceType_v4f32 &&
              Inst->getSrc(0)->getType() == IceType_v4i32);
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       if (llvm::isa<typename Traits::X86OperandMem>(Src0RM))
         Src0RM = legalizeToReg(Src0RM);
-      Variable *T = makeReg(Dest->getType());
+      Variable *T = makeReg(DestTy);
       _cvt(T, Src0RM, Traits::Insts::Cvt::Dq2ps);
       _movp(Dest, T);
     } else if (!Traits::Is64Bit && Inst->getSrc(0)->getType() == IceType_i64) {
       // Use a helper for x86-32.
       constexpr SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
       InstCall *Call =
-          makeHelperCall(isFloat32Asserting32Or64(DestType) ? H_sitofp_i64_f32
-                                                            : H_sitofp_i64_f64,
+          makeHelperCall(isFloat32Asserting32Or64(DestTy) ? H_sitofp_i64_f32
+                                                          : H_sitofp_i64_f64,
                          Dest, MaxSrcs);
       // TODO: Call the correct compiler-rt helper function.
       Call->addArg(Inst->getSrc(0));
@@ -2275,7 +2235,7 @@
         assert(Src0RM->getType() != IceType_i64);
         T_1 = makeReg(IceType_i32);
       }
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
       if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
@@ -2287,8 +2247,7 @@
   case InstCast::Uitofp: {
     Operand *Src0 = Inst->getSrc(0);
     if (isVectorType(Src0->getType())) {
-      assert(Dest->getType() == IceType_v4f32 &&
-             Src0->getType() == IceType_v4i32);
+      assert(DestTy == IceType_v4f32 && Src0->getType() == IceType_v4i32);
       constexpr SizeT MaxSrcs = 1;
       InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs);
       Call->addArg(Src0);
@@ -2298,14 +2257,13 @@
       // Use a helper for x86-32 and x86-64. Also use a helper for i32 on
       // x86-32.
       constexpr SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
       IceString TargetString;
       if (isInt32Asserting32Or64(Src0->getType())) {
-        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32
-                                                          : H_uitofp_i32_f64;
+        TargetString = isFloat32Asserting32Or64(DestTy) ? H_uitofp_i32_f32
+                                                        : H_uitofp_i32_f64;
       } else {
-        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32
-                                                          : H_uitofp_i64_f64;
+        TargetString = isFloat32Asserting32Or64(DestTy) ? H_uitofp_i64_f32
+                                                        : H_uitofp_i64_f64;
       }
       InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
       Call->addArg(Src0);
@@ -2323,7 +2281,7 @@
         assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
         T_1 = makeReg(IceType_i32);
       }
-      Variable *T_2 = makeReg(Dest->getType());
+      Variable *T_2 = makeReg(DestTy);
       if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
@@ -2335,12 +2293,12 @@
   }
   case InstCast::Bitcast: {
     Operand *Src0 = Inst->getSrc(0);
-    if (Dest->getType() == Src0->getType()) {
+    if (DestTy == Src0->getType()) {
       InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
       lowerAssign(Assign);
       return;
     }
-    switch (Dest->getType()) {
+    switch (DestTy) {
     default:
       llvm_unreachable("Unexpected Bitcast dest type");
     case IceType_i8: {
@@ -2358,11 +2316,9 @@
     case IceType_i32:
     case IceType_f32: {
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      Type DestType = Dest->getType();
       Type SrcType = Src0RM->getType();
-      (void)DestType;
-      assert((DestType == IceType_i32 && SrcType == IceType_f32) ||
-             (DestType == IceType_f32 && SrcType == IceType_i32));
+      assert((DestTy == IceType_i32 && SrcType == IceType_f32) ||
+             (DestTy == IceType_f32 && SrcType == IceType_i32));
       // a.i32 = bitcast b.f32 ==>
       //   t.f32 = b.f32
       //   s.f32 = spill t.f32
@@ -2436,7 +2392,7 @@
       } else {
         Src0 = legalize(Src0);
         if (llvm::isa<typename Traits::X86OperandMem>(Src0)) {
-          Variable *T = Func->makeVariable(Dest->getType());
+          Variable *T = Func->makeVariable(DestTy);
           _movq(T, Src0);
           _movq(Dest, T);
           break;
@@ -3037,17 +2993,21 @@
         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
     Variable *T = makeReg(Ty);
     _movp(T, SourceVectRM);
-    if (Ty == IceType_v4f32)
+    if (Ty == IceType_v4f32) {
       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
-    else
-      // TODO(stichnot): For the pinsrb and pinsrw instructions, when the source
-      // operand is a register, it must be a full r32 register like eax, and not
-      // ax/al/ah.  For filetype=asm, InstX86Pinsr<Machine>::emit() compensates
-      // for the use of r16 and r8 by converting them through getBaseReg(),
-      // while emitIAS() validates that the original and base register encodings
-      // are the same.  But for an "interior" register like ah, it should
-      // probably be copied into an r32 via movzx so that the types work out.
+    } else {
+      // For the pinsrb and pinsrw instructions, when the source operand is a
+      // register, it must be a full r32 register like eax, and not ax/al/ah.
+      // For filetype=asm, InstX86Pinsr<Machine>::emit() compensates for the use
+      // of r16 and r8 by converting them through getBaseReg(), while emitIAS()
+      // validates that the original and base register encodings are the same.
+      if (ElementRM->getType() == IceType_i8 &&
+          llvm::isa<Variable>(ElementRM)) {
+        // Don't use ah/bh/ch/dh for pinsrb.
+        ElementRM = copyToReg8(ElementRM);
+      }
       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
+    }
     _movp(Inst->getDest(), T);
   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
     // Use shufps or movss.
@@ -5354,6 +5314,67 @@
   return Traits::X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
 }
 
+/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
+/// Src is assumed to already be legalized.  If the source operand is known to
+/// be a memory or immediate operand, a simple mov will suffice.  But if the
+/// source operand can be a physical register, then it must first be copied into
+/// a physical register that is truncable to 8-bit, then truncated into a
+/// physical register that can receive a truncation, and finally copied into the
+/// result 8-bit register (which in general can be any 8-bit register).  For
+/// example, moving %ebp into %ah may be accomplished as:
+///   movl %ebp, %edx
+///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
+///   movb %dl, %ah
+/// On the other hand, moving a memory or immediate operand into ah:
+///   movb 4(%ebp), %ah
+///   movb $my_imm, %ah
+///
+/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
+/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
+/// use RegNum=NoRegister and then let the caller do a separate copy into
+/// Reg_ah.
+///
+/// Note #2.  ConstantRelocatable operands are also put through this process
+/// (not truncated directly) because our ELF emitter does R_386_32 relocations
+/// but not R_386_8 relocations.
+///
+/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
+/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
+/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
+/// to the pinsrb instruction.
+template <class Machine>
+Variable *TargetX86Base<Machine>::copyToReg8(Operand *Src, int32_t RegNum) {
+  Type Ty = Src->getType();
+  assert(isScalarIntegerType(Ty));
+  assert(Ty != IceType_i1); // i1 sources are rejected up front
+  Variable *Reg = makeReg(IceType_i8, RegNum); // 8-bit result register
+  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
+  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
+    Variable *SrcTruncable = makeReg(Ty); // same type as Src
+    switch (Ty) { // choose the truncable-to-8-bit class for Src's width
+    case IceType_i64:
+      SrcTruncable->setRegClass(RCX86_Is64To8);
+      break;
+    case IceType_i32:
+      SrcTruncable->setRegClass(RCX86_Is32To8);
+      break;
+    case IceType_i16:
+      SrcTruncable->setRegClass(RCX86_Is16To8);
+      break;
+    default:
+      // i8 - just use default register class
+      break;
+    }
+    Variable *SrcRcvr = makeReg(IceType_i8);
+    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
+    _mov(SrcTruncable, Src);     // step 1: same-width copy of Src
+    _mov(SrcRcvr, SrcTruncable); // step 2: truncating move (plain copy for i8)
+    Src = SrcRcvr;               // the final mov below reads the receiver
+  }
+  _mov(Reg, Src); // memory/immediate sources reach here without the detour
+  return Reg;
+}
+
 /// Helper for legalize() to emit the right code to lower an operand to a
 /// register of the appropriate type.
 template <class Machine>
diff --git a/src/IceTargetLoweringX86RegClass.h b/src/IceTargetLoweringX86RegClass.h
new file mode 100644
index 0000000..35ae316
--- /dev/null
+++ b/src/IceTargetLoweringX86RegClass.h
@@ -0,0 +1,36 @@
+//===- subzero/src/IceTargetLoweringX86RegClass.h - x86 reg class -*- C++ -*-=//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the X86 register class extensions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
+
+#include "IceOperand.h" // RC_Target
+
+namespace Ice {
+namespace X86Internal {
+
+// x86-specific register classes, extending enum RegClass from RC_Target up.
+enum RegClassX86 : uint8_t {
+  RCX86_Is64To8 = RC_Target, // 64-bit GPR trivially truncable to 8-bit
+  RCX86_Is32To8,             // 32-bit GPR trivially truncable to 8-bit
+  RCX86_Is16To8,             // 16-bit GPR trivially truncable to 8-bit
+  RCX86_IsTrunc8Rcvr,        // 8-bit GPR that can receive a trunc operation
+  RCX86_IsAhRcvr,            // 8-bit GPR that can be a mov dest from %ah
+  RCX86_NUM                  // one past the last register class value
+};
+
+} // end of namespace X86Internal
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
diff --git a/src/IceTypes.cpp b/src/IceTypes.cpp
index 3756a10..d51d0ba 100644
--- a/src/IceTypes.cpp
+++ b/src/IceTypes.cpp
@@ -95,7 +95,7 @@
 
 const struct TypeAttributeFields TypeAttributes[] = {
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  { sizeLog2, align, elts, elty, str }                                         \
+  { sizeLog2, align, elts, IceType_##elty, str }                               \
   ,
     ICETYPE_TABLE
 #undef X
@@ -120,7 +120,8 @@
           CompareResult)                                                       \
   {                                                                            \
     IsVec, IsInt, IsInt & !IsVec, IsInt & IsVec, IsIntArith, IsFloat,          \
-        IsFloat & !IsVec, IsFloat & IsVec, IsLoadStore, IsParam, CompareResult \
+        IsFloat & !IsVec, IsFloat & IsVec, IsLoadStore, IsParam,               \
+        IceType_##CompareResult                                                \
   }                                                                            \
   ,
     ICETYPE_PROPS_TABLE
diff --git a/src/IceTypes.def b/src/IceTypes.def
index 8db8c63..8149596 100644
--- a/src/IceTypes.def
+++ b/src/IceTypes.def
@@ -29,23 +29,23 @@
 //#define X(tag, str, is_elf64, e_machine, e_flags)
 
 #define ICETYPE_TABLE                                                          \
-  /* enum value, log_2(size), align, # elts, element type, printable */        \
-  /*     string (size and alignment in bytes) */                               \
-  X(IceType_void,  -1,  0,     1,      IceType_void, "void")                   \
-  X(IceType_i1,     0,  1,     1,      IceType_i1,   "i1")                     \
-  X(IceType_i8,     0,  1,     1,      IceType_i8,   "i8")                     \
-  X(IceType_i16,    1,  1,     1,      IceType_i16,  "i16")                    \
-  X(IceType_i32,    2,  1,     1,      IceType_i32,  "i32")                    \
-  X(IceType_i64,    3,  1,     1,      IceType_i64,  "i64")                    \
-  X(IceType_f32,    2,  4,     1,      IceType_f32,  "float")                  \
-  X(IceType_f64,    3,  8,     1,      IceType_f64,  "double")                 \
-  X(IceType_v4i1,   4,  1,     4,      IceType_i1,   "<4 x i1>")               \
-  X(IceType_v8i1,   4,  1,     8,      IceType_i1,   "<8 x i1>")               \
-  X(IceType_v16i1,  4,  1,    16,      IceType_i1,   "<16 x i1>")              \
-  X(IceType_v16i8,  4,  1,    16,      IceType_i8,   "<16 x i8>")              \
-  X(IceType_v8i16,  4,  2,     8,      IceType_i16,  "<8 x i16>")              \
-  X(IceType_v4i32,  4,  4,     4,      IceType_i32,  "<4 x i32>")              \
-  X(IceType_v4f32,  4,  4,     4,      IceType_f32,  "<4 x float>")            \
+  /* tag (IceType_ prefix implied), log_2(size), align, # elts, */             \
+  /*     element tag, printable string (size and alignment in bytes) */        \
+  X(void,  -1,  0,  1, void, "void")                                           \
+  X(i1,     0,  1,  1, i1,   "i1")                                             \
+  X(i8,     0,  1,  1, i8,   "i8")                                             \
+  X(i16,    1,  1,  1, i16,  "i16")                                            \
+  X(i32,    2,  1,  1, i32,  "i32")                                            \
+  X(i64,    3,  1,  1, i64,  "i64")                                            \
+  X(f32,    2,  4,  1, f32,  "float")                                          \
+  X(f64,    3,  8,  1, f64,  "double")                                         \
+  X(v4i1,   4,  1,  4, i1,   "<4 x i1>")                                       \
+  X(v8i1,   4,  1,  8, i1,   "<8 x i1>")                                       \
+  X(v16i1,  4,  1, 16, i1,   "<16 x i1>")                                      \
+  X(v16i8,  4,  1, 16, i8,   "<16 x i8>")                                      \
+  X(v8i16,  4,  2,  8, i16,  "<8 x i16>")                                      \
+  X(v4i32,  4,  4,  4, i32,  "<4 x i32>")                                      \
+  X(v4f32,  4,  4,  4, f32,  "<4 x float>")                                    \
 //#define X(tag, sizeLog2, align, elts, elty, str)
 
 // Dictionary:
@@ -58,22 +58,22 @@
 //   CR - Result type of compare instruction for argument type
 //        (IceType_void if disallowed)
 #define ICETYPE_PROPS_TABLE                                                    \
-  /* Enum Value    V  I  F IA  LS P CR */                                      \
-  X(IceType_void,  0, 0, 0, 0, 0, 0, IceType_void)                             \
-  X(IceType_i1,    0, 1, 0, 0, 0, 0, IceType_i1)                               \
-  X(IceType_i8,    0, 1, 0, 1, 1, 0, IceType_i1)                               \
-  X(IceType_i16,   0, 1, 0, 1, 1, 0, IceType_i1)                               \
-  X(IceType_i32,   0, 1, 0, 1, 1, 1, IceType_i1)                               \
-  X(IceType_i64,   0, 1, 0, 1, 1, 1, IceType_i1)                               \
-  X(IceType_f32,   0, 0, 1, 0, 1, 1, IceType_i1)                               \
-  X(IceType_f64,   0, 0, 1, 0, 1, 1, IceType_i1)                               \
-  X(IceType_v4i1,  1, 1, 0, 0, 0, 1, IceType_v4i1)                             \
-  X(IceType_v8i1,  1, 1, 0, 0, 0, 1, IceType_v8i1)                             \
-  X(IceType_v16i1, 1, 1, 0, 0, 0, 1, IceType_v16i1)                            \
-  X(IceType_v16i8, 1, 1, 0, 1, 1, 1, IceType_v16i1)                            \
-  X(IceType_v8i16, 1, 1, 0, 1, 1, 1, IceType_v8i1)                             \
-  X(IceType_v4i32, 1, 1, 0, 1, 1, 1, IceType_v4i1)                             \
-  X(IceType_v4f32, 1, 0, 1, 0, 1, 1, IceType_v4i1)                             \
+  /* Type tag      V  I  F IA LS  P  CR */                                     \
+  X(void,          0, 0, 0, 0, 0, 0, void)                                     \
+  X(i1,            0, 1, 0, 0, 0, 0, i1)                                       \
+  X(i8,            0, 1, 0, 1, 1, 0, i1)                                       \
+  X(i16,           0, 1, 0, 1, 1, 0, i1)                                       \
+  X(i32,           0, 1, 0, 1, 1, 1, i1)                                       \
+  X(i64,           0, 1, 0, 1, 1, 1, i1)                                       \
+  X(f32,           0, 0, 1, 0, 1, 1, i1)                                       \
+  X(f64,           0, 0, 1, 0, 1, 1, i1)                                       \
+  X(v4i1,          1, 1, 0, 0, 0, 1, v4i1)                                     \
+  X(v8i1,          1, 1, 0, 0, 0, 1, v8i1)                                     \
+  X(v16i1,         1, 1, 0, 0, 0, 1, v16i1)                                    \
+  X(v16i8,         1, 1, 0, 1, 1, 1, v16i1)                                    \
+  X(v8i16,         1, 1, 0, 1, 1, 1, v8i1)                                     \
+  X(v4i32,         1, 1, 0, 1, 1, 1, v4i1)                                     \
+  X(v4f32,         1, 0, 1, 0, 1, 1, v4i1)                                     \
 //#define X(tag, IsVec, IsInt, IsFloat, IsIntArith, IsLoadStore, IsParam,      \
 //          CompareResult)
 
diff --git a/src/IceTypes.h b/src/IceTypes.h
index f6705e9..380a613 100644
--- a/src/IceTypes.h
+++ b/src/IceTypes.h
@@ -22,7 +22,7 @@
 namespace Ice {
 
 enum Type {
-#define X(tag, sizeLog2, align, elts, elty, str) tag,
+#define X(tag, sizeLog2, align, elts, elty, str) IceType_##tag,
   ICETYPE_TABLE
 #undef X
       IceType_NUM