Add 3rd argument register (EBX) to X86.

Implement more instructions: 8- and 32-bit jump forms with relaxation during assembly, SSE orps/xorps/psllq and shift-by-immediate encodings, and indexed (base + index * scale + disp) stores.

Change-Id: I3af7bbaf18eedc6537f1cfc2d57c4f6106fb5164
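
For orientation, a sketch of the resulting x86 managed-ABI register map, summarized from the X86LIR.h and calling-convention hunks below:

    // rARG0 = EAX: Method* / arg 0, scratch, return value
    // rARG1 = ECX: arg 1, scratch                           (was arg 2)
    // rARG2 = EDX: arg 2, scratch, high half of long return (was arg 1)
    // rARG3 = EBX: arg 3, scratch                           (was callee-save)
    // Arguments beyond rARG3 continue to be passed on the stack.
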
diff --git a/src/calling_convention_x86.cc b/src/calling_convention_x86.cc
index ee002c5..15f4495 100644
--- a/src/calling_convention_x86.cc
+++ b/src/calling_convention_x86.cc
@@ -91,9 +91,12 @@
   if (entry_spills_.size() == 0) {
     size_t num_spills = NumArgs() + NumLongOrDoubleArgs();
     if (num_spills > 0) {
-      entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EDX));
+      entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(ECX));
       if (num_spills > 1) {
-        entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(ECX));
+        entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EDX));
+        if (num_spills > 2) {
+          entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EBX));
+        }
       }
     }
   }
diff --git a/src/compiled_method.cc b/src/compiled_method.cc
index bf88880..3ac32d4 100644
--- a/src/compiled_method.cc
+++ b/src/compiled_method.cc
@@ -133,7 +133,7 @@
     case kThumb2:
       return RoundUp(offset, kArmAlignment);
     case kX86:
-      return offset;
+      return RoundUp(offset, kX86Alignment);
     default:
       LOG(FATAL) << "Unknown InstructionSet: " << static_cast<int>(instruction_set);
       return 0;
diff --git a/src/compiler/codegen/CodegenUtil.cc b/src/compiler/codegen/CodegenUtil.cc
index 2e2c254..f2449e5 100644
--- a/src/compiler/codegen/CodegenUtil.cc
+++ b/src/compiler/codegen/CodegenUtil.cc
@@ -819,7 +819,8 @@
         } else {
             cUnit->assemblerRetries++;
             if (cUnit->assemblerRetries > MAX_ASSEMBLER_RETRIES) {
-                LOG(FATAL) << "Assembler error - too many retries";
+                oatCodegenDump(cUnit);
+                LOG(FATAL) << "Assembler error - too many retries";
             }
             // Redo offsets and try again
             oatAssignOffsets(cUnit);
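
For context, a minimal sketch of the retry loop this hunk sits in, using names from the diff; the assembleInstructions entry point shown is hypothetical:

    AssemblerStatus res = kRetryAll;
    while (res != kSuccess) {
        res = assembleInstructions(cUnit, startAddr);  // hypothetical driver call
        if (res != kSuccess) {
            cUnit->assemblerRetries++;
            if (cUnit->assemblerRetries > MAX_ASSEMBLER_RETRIES) {
                oatCodegenDump(cUnit);  // new: dump codegen state before aborting
                LOG(FATAL) << "Assembler error - too many retries";
            }
            // Redo offsets and try again
            oatAssignOffsets(cUnit);
        }
    }
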
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index cc0d624..c5b28b3 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -178,11 +178,7 @@
         if (arg1.wide == 0) {
             loadValueDirectFixed(cUnit, arg1, rARG2);
         } else {
-#if defined(TARGET_X86)
-            UNIMPLEMENTED(FATAL);
-#else
             loadValueDirectWideFixed(cUnit, arg1, rARG2, rARG3);
-#endif
         }
     }
     oatClobberCalleeSave(cUnit);
@@ -274,11 +270,7 @@
     if (arg2.wide == 0) {
         loadValueDirectFixed(cUnit, arg2, rARG2);
     } else {
-#if defined(TARGET_X86)
-        UNIMPLEMENTED(FATAL);
-#else
         loadValueDirectWideFixed(cUnit, arg2, rARG2, rARG3);
-#endif
     }
     loadConstant(cUnit, rARG0, arg0);
     oatClobberCalleeSave(cUnit);
@@ -1406,10 +1398,21 @@
 
     // Now, redo loadValues in case they didn't survive the call
 
-    int regPtr;
     rlArray = loadValue(cUnit, rlArray, kCoreReg);
     rlIndex = loadValue(cUnit, rlIndex, kCoreReg);
 
+#if defined(TARGET_X86)
+    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
+        /* if (rlIndex >= [rlArray + lenOffset]) goto kThrowArrayBounds */
+        genRegMemCheck(cUnit, kCondUge, rlIndex.lowReg, rlArray.lowReg,
+                       lenOffset, mir, kThrowArrayBounds);
+    }
+    rlSrc = loadValue(cUnit, rlSrc, regClass);
+    storeBaseIndexedDisp(cUnit, NULL, rlArray.lowReg, rlIndex.lowReg, scale,
+                         dataOffset, rlSrc.lowReg, INVALID_REG, kWord,
+                         INVALID_SREG);
+#else
+    int regPtr;
     if (oatIsTemp(cUnit, rlArray.lowReg)) {
         oatClobber(cUnit, rlArray.lowReg);
         regPtr = rlArray.lowReg;
@@ -1437,6 +1440,7 @@
     }
     storeBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlSrc.lowReg,
                      scale, kWord);
+#endif
     markGCCard(cUnit, rlSrc.lowReg, rlArray.lowReg);
 }
 
@@ -1555,10 +1559,10 @@
       dataOffset = Array::DataOffset(sizeof(int32_t)).Int32Value();
     }
 
-    int regPtr;
     rlArray = loadValue(cUnit, rlArray, kCoreReg);
     rlIndex = loadValue(cUnit, rlIndex, kCoreReg);
-
+#if !defined(TARGET_X86)
+    int regPtr;
     if (oatIsTemp(cUnit, rlArray.lowReg)) {
         oatClobber(cUnit, rlArray.lowReg);
         regPtr = rlArray.lowReg;
@@ -1566,10 +1570,21 @@
         regPtr = oatAllocTemp(cUnit);
         opRegCopy(cUnit, regPtr, rlArray.lowReg);
     }
+#endif
 
     /* null object? */
     genNullCheck(cUnit, rlArray.sRegLow, rlArray.lowReg, mir);
 
+#if defined(TARGET_X86)
+    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
+        /* if (rlIndex >= [rlArray + lenOffset]) goto kThrowArrayBounds */
+        genRegMemCheck(cUnit, kCondUge, rlIndex.lowReg, rlArray.lowReg,
+                       lenOffset, mir, kThrowArrayBounds);
+    }
+    rlSrc = loadValue(cUnit, rlSrc, regClass);
+    storeBaseIndexedDisp(cUnit, NULL, rlArray.lowReg, rlIndex.lowReg, scale, dataOffset,
+                         rlSrc.lowReg, rlSrc.highReg, size, INVALID_SREG);
+#else
     bool needsRangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
     int regLen = INVALID_REG;
     if (needsRangeCheck) {
@@ -1612,6 +1627,7 @@
         storeBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlSrc.lowReg,
                          scale, size);
     }
+#endif
 }
 
 void genLong3Addr(CompilationUnit* cUnit, MIR* mir, OpKind firstOp,
@@ -1893,20 +1909,18 @@
             opRegRegImm(cUnit, kOpAsr, rlResult.lowReg, tReg, k);
         }
     } else {
-        int cReg = oatAllocTemp(cUnit);
-        loadConstant(cUnit, cReg, lit - 1);
         int tReg1 = oatAllocTemp(cUnit);
         int tReg2 = oatAllocTemp(cUnit);
         if (lit == 2) {
             opRegRegImm(cUnit, kOpLsr, tReg1, rlSrc.lowReg, 32 - k);
             opRegRegReg(cUnit, kOpAdd, tReg2, tReg1, rlSrc.lowReg);
-            opRegRegReg(cUnit, kOpAnd, tReg2, tReg2, cReg);
+            opRegRegImm(cUnit, kOpAnd, tReg2, tReg2, lit - 1);
             opRegRegReg(cUnit, kOpSub, rlResult.lowReg, tReg2, tReg1);
         } else {
             opRegRegImm(cUnit, kOpAsr, tReg1, rlSrc.lowReg, 31);
             opRegRegImm(cUnit, kOpLsr, tReg1, tReg1, 32 - k);
             opRegRegReg(cUnit, kOpAdd, tReg2, tReg1, rlSrc.lowReg);
-            opRegRegReg(cUnit, kOpAnd, tReg2, tReg2, cReg);
+            opRegRegImm(cUnit, kOpAnd, tReg2, tReg2, lit - 1);
             opRegRegReg(cUnit, kOpSub, rlResult.lowReg, tReg2, tReg1);
         }
     }
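
The hunk above folds the (lit - 1) mask into an immediate AND instead of materializing it in a temp register. As a scalar sketch of what the emitted rem-by-power-of-two sequence computes (assuming lit == 1 << k):

    // C++ equivalent of the emitted sequence; t1 biases negative dividends so
    // the result keeps the sign of src, e.g. rem_pow2(-5, 2) == -1 == -5 % 4.
    int32_t rem_pow2(int32_t src, int k) {
      int32_t t1 = static_cast<uint32_t>(src >> 31) >> (32 - k);  // asr 31; lsr 32-k
      int32_t t2 = (t1 + src) & ((1 << k) - 1);                   // add; and lit-1
      return t2 - t1;                                             // sub
    }
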
diff --git a/src/compiler/codegen/GenInvoke.cc b/src/compiler/codegen/GenInvoke.cc
index 037a9bb..ebc8bc2 100644
--- a/src/compiler/codegen/GenInvoke.cc
+++ b/src/compiler/codegen/GenInvoke.cc
@@ -51,13 +51,8 @@
 
     if (cUnit->numIns == 0)
         return;
-#if !defined(TARGET_X86)
     const int numArgRegs = 3;
     static int argRegs[] = {rARG1, rARG2, rARG3};
-#else
-    const int numArgRegs = 2;
-    static int argRegs[] = {rARG1, rARG2};
-#endif
     int startVReg = cUnit->numDalvikRegisters - cUnit->numIns;
     /*
      * Copy incoming arguments to their proper home locations.
@@ -425,11 +420,7 @@
                 reg = rlArg.highReg;
             } else {
                 // rARG2 & rARG3 can safely be used here
-#if defined(TARGET_X86)
-                UNIMPLEMENTED(FATAL);
-#else
                 reg = rARG3;
-#endif
                 loadWordDisp(cUnit, rSP,
                              oatSRegOffset(cUnit, rlArg.sRegLow) + 4, reg);
                 callState = nextCallInsn(cUnit, mir, callState, dexIdx,
@@ -453,12 +444,8 @@
                 highReg = rlArg.highReg;
             } else {
                 lowReg = rARG2;
-#if defined(TARGET_X86)
-                UNIMPLEMENTED(FATAL);
-#else
-                highReg = rARG3;
-#endif
                 if (rlArg.wide) {
+                    highReg = rARG3;
                     loadValueDirectWideFixed(cUnit, rlArg, lowReg, highReg);
                 } else {
                     loadValueDirectFixed(cUnit, rlArg, lowReg);
diff --git a/src/compiler/codegen/x86/ArchFactory.cc b/src/compiler/codegen/x86/ArchFactory.cc
index efa54e0..eec1cbd 100644
--- a/src/compiler/codegen/x86/ArchFactory.cc
+++ b/src/compiler/codegen/x86/ArchFactory.cc
@@ -214,7 +214,7 @@
       thisLIR = NEXT_LIR(thisLIR)) {
 
     /* Branch to the next instruction */
-    if (thisLIR->opcode == kX86Jmp) {
+    if (thisLIR->opcode == kX86Jmp8 || thisLIR->opcode == kX86Jmp32) {
       LIR* nextLIR = thisLIR;
 
       while (true) {
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index d2a33ea..b9dd978 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -257,6 +257,8 @@
   EXT_0F_ENCODING_MAP(Ucomiss,   0x00, 0x2E),
   EXT_0F_ENCODING_MAP(Comisd,    0x66, 0x2F),
   EXT_0F_ENCODING_MAP(Comiss,    0x00, 0x2F),
+  EXT_0F_ENCODING_MAP(Orps,      0x00, 0x56),
+  EXT_0F_ENCODING_MAP(Xorps,     0x00, 0x57),
   EXT_0F_ENCODING_MAP(Addsd,     0xF2, 0x58),
   EXT_0F_ENCODING_MAP(Addss,     0xF3, 0x58),
   EXT_0F_ENCODING_MAP(Mulsd,     0xF2, 0x59),
@@ -268,6 +270,8 @@
   EXT_0F_ENCODING_MAP(Divsd,     0xF2, 0x5E),
   EXT_0F_ENCODING_MAP(Divss,     0xF3, 0x5E),
 
+  { kX86PsllqRI, kRegImm, IS_BINARY_OP, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r, !1d" },
+
   EXT_0F_ENCODING_MAP(Movdxr,    0x66, 0x6E),
   EXT_0F_ENCODING_MAP(Movdrx,    0x66, 0x7E),
 
@@ -283,8 +287,10 @@
   EXT_0F_ENCODING_MAP(Movsx16, 0x00, 0xBF),
 #undef EXT_0F_ENCODING_MAP
 
-  { kX86Jcc,   kJcc,  IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0x70, 0, 0, 0, 0, 0 }, "Jcc", "!1c !0t" },
-  { kX86Jmp,   kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0xE9, 0, 0, 0, 0, 0 }, "Jmp", "!0t" },
+  { kX86Jcc8,  kJcc,  IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0x70, 0,    0, 0, 0, 0 }, "Jcc8",  "!1c !0t" },
+  { kX86Jcc32, kJcc,  IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0x0F, 0x80, 0, 0, 0, 0 }, "Jcc32", "!1c !0t" },
+  { kX86Jmp8,  kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0xEB, 0,    0, 0, 0, 0 }, "Jmp8",  "!0t" },
+  { kX86Jmp32, kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0xE9, 0,    0, 0, 0, 0 }, "Jmp32", "!0t" },
   { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH,               { 0,             0, 0xE8, 0, 0, 0, 0, 0 }, "CallR", "!0r" },
   { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0, 0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
   { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0, 0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
@@ -359,9 +365,14 @@
     case kRegThread:  // lir operands - 0: reg, 1: disp
       return computeSize(entry, 0x12345678, false);  // displacement size is always 32bit
     case kRegImm: {  // lir operands - 0: reg, 1: immediate
-      int reg = lir->operands[0];
-      // AX opcodes don't require the modrm byte.
-      return computeSize(entry, 0, false) - (reg == rAX ? 1 : 0);
+      size_t size = computeSize(entry, 0, false);
+      if (entry->skeleton.ax_opcode == 0) {
+        return size;
+      } else {
+        // AX opcodes don't require the modrm byte.
+        int reg = lir->operands[0];
+        return size - (reg == rAX ? 1 : 0);
+      }
     }
     case kMemImm:  // lir operands - 0: base, 1: disp, 2: immediate
       CHECK_NE(lir->operands[0], static_cast<int>(rSP));  // TODO: add extra SIB byte
@@ -403,10 +414,20 @@
       return computeSize(entry, lir->operands[1], false);
     case kArrayCond:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: cond
       return computeSize(entry, lir->operands[3], true);
-    case kJcc: case kJmp:
-      // Jumps only return the short form length, the correct length will be assigned to LIR
-      // flags.size during assembly.
-      return 2;
+    case kJcc:
+      if (lir->opcode == kX86Jcc8) {
+        return 2;  // opcode + rel8
+      } else {
+        DCHECK(lir->opcode == kX86Jcc32);
+        return 6;  // 2 byte opcode + rel32
+      }
+    case kJmp:
+      if (lir->opcode == kX86Jmp8) {
+        return 2;  // opcode + rel8
+      } else {
+        DCHECK(lir->opcode == kX86Jmp32);
+        return 5;  // opcode + rel32
+      }
     case kCall:
       switch(lir->opcode) {
         case kX86CallR: return 2;  // opcode modrm
@@ -586,6 +607,12 @@
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
 
+static void emitArrayReg(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                         uint8_t base, uint8_t index, int scale, int disp, uint8_t reg) {
+  // Opcode will flip operands.
+  emitRegArray(cUnit, entry, reg, base, index, scale, disp);
+}
+
 static void emitRegThread(CompilationUnit* cUnit, const X86EncodingMap* entry,
                           uint8_t reg, int disp) {
   DCHECK_NE(entry->skeleton.prefix1, 0);
@@ -770,11 +797,50 @@
   cUnit->codeBuffer.push_back((imm >> 24) & 0xFF);
 }
 
+static void emitShiftRegImm(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                       uint8_t reg, int imm) {
+  if (entry->skeleton.prefix1 != 0) {
+    cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
+    if (entry->skeleton.prefix2 != 0) {
+      cUnit->codeBuffer.push_back(entry->skeleton.prefix2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.prefix2);
+  }
+  if (imm != 1) {
+    cUnit->codeBuffer.push_back(entry->skeleton.opcode);
+  } else {
+    // Shorter encoding for a shift by 1 (the table's ax_opcode slot holds that form)
+    cUnit->codeBuffer.push_back(entry->skeleton.ax_opcode);
+  }
+  if (entry->skeleton.opcode == 0x0F) {
+    cUnit->codeBuffer.push_back(entry->skeleton.extra_opcode1);
+    if (entry->skeleton.extra_opcode1 == 0x38 || entry->skeleton.extra_opcode1 == 0x3A) {
+      cUnit->codeBuffer.push_back(entry->skeleton.extra_opcode2);
+    } else {
+      DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.extra_opcode1);
+    DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+  }
+  DCHECK_LT(reg, 8);
+  uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;  // mod 11: register-direct
+  cUnit->codeBuffer.push_back(modrm);
+  if (imm != 1) {
+    DCHECK_EQ(entry->skeleton.immediate_bytes, 1);
+    DCHECK(IS_SIMM8(imm));
+    cUnit->codeBuffer.push_back(imm & 0xFF);
+  }
+}
+
 static void emitJmp(CompilationUnit* cUnit, const X86EncodingMap* entry, int rel) {
-  if (IS_SIMM8(rel)) {
+  if (entry->opcode == kX86Jmp8) {
+    DCHECK(IS_SIMM8(rel));
     cUnit->codeBuffer.push_back(0xEB);
     cUnit->codeBuffer.push_back(rel & 0xFF);
   } else {
+    DCHECK(entry->opcode == kX86Jmp32);
     cUnit->codeBuffer.push_back(0xE9);
     cUnit->codeBuffer.push_back(rel & 0xFF);
     cUnit->codeBuffer.push_back((rel >> 8) & 0xFF);
@@ -786,10 +852,12 @@
 static void emitJcc(CompilationUnit* cUnit, const X86EncodingMap* entry,
                     int rel, uint8_t cc) {
   DCHECK_LT(cc, 16);
-  if (IS_SIMM8(rel)) {
+  if (entry->opcode == kX86Jcc8) {
+    DCHECK(IS_SIMM8(rel));
     cUnit->codeBuffer.push_back(0x70 | cc);
     cUnit->codeBuffer.push_back(rel & 0xFF);
   } else {
+    DCHECK(entry->opcode == kX86Jcc32);
     cUnit->codeBuffer.push_back(0x0F);
     cUnit->codeBuffer.push_back(0x80 | cc);
     cUnit->codeBuffer.push_back(rel & 0xFF);
@@ -889,45 +957,53 @@
 
     if (lir->flags.pcRelFixup) {
       switch (lir->opcode) {
-        case kX86Jcc: {
-            LIR *targetLIR = lir->target;
-            DCHECK(targetLIR != NULL);
-            int delta = 0;
-            intptr_t pc;
-            if (IS_SIMM8(lir->operands[0])) {
-              pc = lir->offset + 2 /* opcode + rel8 */;
-            } else {
-              pc = lir->offset + 6 /* 2 byte opcode + rel32 */;
-            }
-            intptr_t target = targetLIR->offset;
-            delta = target - pc;
-            if (IS_SIMM8(delta) != IS_SIMM8(lir->operands[0])) {
-              res = kRetryAll;
-            }
-            lir->operands[0] = delta;
-            break;
+        case kX86Jcc8: {
+          LIR *targetLIR = lir->target;
+          DCHECK(targetLIR != NULL);
+          int delta = 0;
+          intptr_t pc;
+          if (IS_SIMM8(lir->operands[0])) {
+            pc = lir->offset + 2 /* opcode + rel8 */;
+          } else {
+            pc = lir->offset + 6 /* 2 byte opcode + rel32 */;
+          }
+          intptr_t target = targetLIR->offset;
+          delta = target - pc;
+          if (IS_SIMM8(delta) != IS_SIMM8(lir->operands[0])) {
+            LOG(INFO) << "Retry for JCC growth at " << lir->offset
+                << " delta: " << delta << " old delta: " << lir->operands[0];
+            lir->opcode = kX86Jcc32;
+            oatSetupResourceMasks(lir);
+            res = kRetryAll;
+          }
+          lir->operands[0] = delta;
+          break;
         }
-        case kX86Jmp: {
-            LIR *targetLIR = lir->target;
-            DCHECK(targetLIR != NULL);
-            int delta = 0;
-            intptr_t pc;
-            if (IS_SIMM8(lir->operands[0])) {
-              pc = lir->offset + 2 /* opcode + rel8 */;
-            } else {
-              pc = lir->offset + 5 /* opcode + rel32 */;
-            }
-            intptr_t target = targetLIR->offset;
-            delta = target - pc;
-            if (!(cUnit->disableOpt & (1 << kSafeOptimizations)) && lir->operands[0] == 0) {
-              // Useless branch
-              lir->flags.isNop = true;
-              res = kRetryAll;
-            } else if (IS_SIMM8(delta) != IS_SIMM8(lir->operands[0])) {
-              res = kRetryAll;
-            }
-            lir->operands[0] = delta;
-            break;
+        case kX86Jmp8: {
+          LIR *targetLIR = lir->target;
+          DCHECK(targetLIR != NULL);
+          int delta = 0;
+          intptr_t pc;
+          if (IS_SIMM8(lir->operands[0])) {
+            pc = lir->offset + 2 /* opcode + rel8 */;
+          } else {
+            pc = lir->offset + 5 /* opcode + rel32 */;
+          }
+          intptr_t target = targetLIR->offset;
+          delta = target - pc;
+          if (!(cUnit->disableOpt & (1 << kSafeOptimizations)) && lir->operands[0] == 0) {
+            // Useless branch
+            lir->flags.isNop = true;
+            LOG(INFO) << "Retry for useless branch at " << lir->offset;
+            res = kRetryAll;
+          } else if (IS_SIMM8(delta) != IS_SIMM8(lir->operands[0])) {
+            LOG(INFO) << "Retry for JMP growth at " << lir->offset;
+            lir->opcode = kX86Jmp32;
+            oatSetupResourceMasks(lir);
+            res = kRetryAll;
+          }
+          lir->operands[0] = delta;
+          break;
         }
         default:
           break;
@@ -967,6 +1043,10 @@
       case kMemReg:  // lir operands - 0: base, 1: disp, 2: reg
         emitMemReg(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
+      case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
+        emitArrayReg(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2],
+                     lir->operands[3], lir->operands[4]);
+        break;
       case kRegMem:  // lir operands - 0: reg, 1: base, 2: disp
         emitRegMem(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
@@ -989,6 +1069,9 @@
       case kMovRegImm:  // lir operands - 0: reg, 1: immediate
         emitMovRegImm(cUnit, entry, lir->operands[0], lir->operands[1]);
         break;
+      case kShiftRegImm:  // lir operands - 0: reg, 1: immediate
+        emitShiftRegImm(cUnit, entry, lir->operands[0], lir->operands[1]);
+        break;
       case kJmp:  // lir operands - 0: rel
         emitJmp(cUnit, entry, lir->operands[0]);
         break;
@@ -1012,10 +1095,11 @@
         emitUnimplemented(cUnit, entry, lir);
         break;
     }
-    CHECK_EQ(static_cast<size_t>(oatGetInsnSize(lir)),
-             cUnit->codeBuffer.size() - starting_cbuf_size)
-        << "Instruction size mismatch for entry: " << EncodingMap[lir->opcode].name;
-
+    if (entry->kind != kJcc && entry->kind != kJmp) {
+      CHECK_EQ(static_cast<size_t>(oatGetInsnSize(lir)),
+               cUnit->codeBuffer.size() - starting_cbuf_size)
+          << "Instruction size mismatch for entry: " << EncodingMap[lir->opcode].name;
+    }
   }
   return res;
 }
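
The split jump opcodes implement a simple branch-relaxation scheme: every new branch is emitted in the 2-byte rel8 form, and when the fixup pass finds a displacement out of rel8 range it rewrites the LIR opcode to the rel32 form and returns kRetryAll. The byte encodings involved (standard x86):

    // kX86Jmp8:  EB rel8           2 bytes, rel8 in [-128, 127]
    // kX86Jmp32: E9 rel32          5 bytes
    // kX86Jcc8:  70+cc rel8        2 bytes, e.g. 74 = JE
    // kX86Jcc32: 0F 80+cc rel32    6 bytes, e.g. 0F 84 = JE
    // Example: a forward jump of +300 no longer fits in rel8, so after
    // promotion the next pass emits:
    //   E9 2C 01 00 00             // jmp .+300 (0x12C, little-endian)
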
diff --git a/src/compiler/codegen/x86/FP/X86FP.cc b/src/compiler/codegen/x86/FP/X86FP.cc
index 52b4fc4..c916640 100644
--- a/src/compiler/codegen/x86/FP/X86FP.cc
+++ b/src/compiler/codegen/x86/FP/X86FP.cc
@@ -213,7 +213,7 @@
   }
   LIR* branch = NULL;
   if (unorderedGt) {
-    branch = newLIR2(cUnit, kX86Jcc, 0, kX86CondPE);
+    branch = newLIR2(cUnit, kX86Jcc8, 0, kX86CondPE);
   }
   newLIR2(cUnit, kX86Set8R, rlResult.lowReg, kX86CondA /* above - unsigned > */);
   newLIR2(cUnit, kX86Sbb32RI, rlResult.lowReg, 0);
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index 96fa08a..aef5879 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -32,7 +32,7 @@
 #endif
 };
 /*static*/ int reservedRegs[] = {rSP};
-/*static*/ int coreTemps[] = {rAX, rCX, rDX};
+/*static*/ int coreTemps[] = {rAX, rCX, rDX, rBX};
 /*static*/ int fpRegs[] = {
     fr0, fr1, fr2, fr3, fr4, fr5, fr6, fr7,
 #ifdef TARGET_REX_SUPPORT
@@ -93,25 +93,24 @@
  * 2) The codegen is under fixed register usage
  */
 LIR *loadConstantNoClobber(CompilationUnit *cUnit, int rDest, int value) {
-  LIR *res;
-
   int rDestSave = rDest;
-  int isFpReg = FPREG(rDest);
-  if (isFpReg) {
+  if (FPREG(rDest)) {
+    if (value == 0) {
+      return newLIR2(cUnit, kX86XorpsRR, rDest, rDest);
+    }
     DCHECK(SINGLEREG(rDest));
     rDest = oatAllocTemp(cUnit);
   }
 
-  /* See if the value can be constructed cheaply */
+  LIR *res;
   if (value == 0) {
     res = newLIR2(cUnit, kX86Xor32RR, rDest, rDest);
   } else {
     res = newLIR2(cUnit, kX86Mov32RI, rDest, value);
   }
 
-  if (isFpReg) {
-    UNIMPLEMENTED(FATAL);
-    newLIR2(cUnit, kX86Mov32RR, rDest, rDestSave);
+  if (FPREG(rDestSave)) {
+    newLIR2(cUnit, kX86MovdxrRR, rDestSave, rDest);
     oatFreeTemp(cUnit, rDest);
   }
 
@@ -120,7 +119,7 @@
 
 LIR* opBranchUnconditional(CompilationUnit *cUnit, OpKind op) {
   CHECK_EQ(op, kOpUncondBr);
-  return newLIR1(cUnit, kX86Jmp, 0 /* offset to be patched */ );
+  return newLIR1(cUnit, kX86Jmp8, 0 /* offset to be patched */ );
 }
 
 LIR *loadMultiple(CompilationUnit *cUnit, int rBase, int rMask);
@@ -128,7 +127,7 @@
 X86ConditionCode oatX86ConditionEncoding(ConditionCode cond);
 LIR* opCondBranch(CompilationUnit* cUnit, ConditionCode cc, LIR* target)
 {
-  LIR* branch = newLIR2(cUnit, kX86Jcc, 0 /* offset to be patched */,
+  LIR* branch = newLIR2(cUnit, kX86Jcc8, 0 /* offset to be patched */,
                         oatX86ConditionEncoding(cc));
   branch->target = target;
   return branch;
@@ -285,13 +284,18 @@
   if (op == kOpMul) {
     X86OpCode opcode = IS_SIMM8(value) ? kX86Imul32RRI8 : kX86Imul32RRI;
     return newLIR3(cUnit, opcode, rDest, rSrc, value);
-  }
-  if (op == kOpLsl && value >= 0 && value <= 3) { // lea shift special case
-    return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc /* base */,
-                   r4sib_no_index /* index */, value /* scale */, value /* disp */);
+  } else if (op == kOpAnd) {
+    if (value == 0xFF) {
+      return newLIR2(cUnit, kX86Movzx8RR, rDest, rSrc);
+    } else if (value == 0xFFFF) {
+      return newLIR2(cUnit, kX86Movzx16RR, rDest, rSrc);
+    }
   }
   if (rDest != rSrc) {
-    if (op == kOpAdd) { // lea add special case
+    if (op == kOpLsl && value >= 0 && value <= 3) { // lea shift special case
+      return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc /* base */,
+                     r4sib_no_index /* index */, value /* scale */, value /* disp */);
+    } else if (op == kOpAdd) { // lea add special case
       return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc /* base */,
                      r4sib_no_index /* index */, 0 /* scale */, value /* disp */);
     }
@@ -326,8 +330,26 @@
                                      int rDestHi, int valLo, int valHi)
 {
     LIR *res;
-    res = loadConstantNoClobber(cUnit, rDestLo, valLo);
-    loadConstantNoClobber(cUnit, rDestHi, valHi);
+    if (FPREG(rDestLo)) {
+      DCHECK(FPREG(rDestHi));  // ignore rDestHi
+      if (valLo == 0 && valHi == 0) {
+        return newLIR2(cUnit, kX86XorpsRR, rDestLo, rDestLo);
+      } else {
+        if (valLo == 0) {
+          res = newLIR2(cUnit, kX86XorpsRR, rDestLo, rDestLo);
+        } else {
+          res = loadConstantNoClobber(cUnit, rDestLo, valLo);
+        }
+        if (valHi != 0) {
+          loadConstantNoClobber(cUnit, rDestHi, valHi);
+          newLIR2(cUnit, kX86PsllqRI, rDestHi, 32);
+          newLIR2(cUnit, kX86OrpsRR, rDestLo, rDestHi);
+        }
+      }
+    } else {
+      res = loadConstantNoClobber(cUnit, rDestLo, valLo);
+      loadConstantNoClobber(cUnit, rDestHi, valHi);
+    }
     return res;
 }
 
@@ -593,23 +615,22 @@
                              rDestLo, rDestHi, kLong, sReg);
 }
 
-LIR *storeBaseDispBody(CompilationUnit *cUnit, int rBase,
-                       int displacement, int rSrc, int rSrcHi,
-                       OpSize size)
-{
-  LIR *res = NULL;
+LIR* storeBaseIndexedDisp(CompilationUnit *cUnit, MIR *mir,
+                          int rBase, int rIndex, int scale, int displacement,
+                          int rSrc, int rSrcHi,
+                          OpSize size, int sReg) {
   LIR *store = NULL;
   LIR *store2 = NULL;
-  X86OpCode opcode = kX86Bkpt;
+  bool isArray = rIndex != INVALID_REG;
   bool pair = false;
   bool is64bit = false;
+  X86OpCode opcode = kX86Nop;
   switch (size) {
     case kLong:
     case kDouble:
       is64bit = true;
       if (FPREG(rSrc)) {
-        pair = false;
-        opcode = kX86MovsdMR;
+        opcode = isArray ? kX86MovsdAR : kX86MovsdMR;
         if (DOUBLEREG(rSrc)) {
           rSrc = rSrc - FP_DOUBLE;
         } else {
@@ -619,61 +640,61 @@
         rSrcHi = rSrc + 1;
       } else {
         pair = true;
-        opcode = kX86Mov32MR;
+        opcode = isArray ? kX86Mov32AR : kX86Mov32MR;
       }
       // TODO: double store is to unaligned address
       DCHECK_EQ((displacement & 0x3), 0);
       break;
     case kWord:
     case kSingle:
-      opcode = kX86Mov32MR;
+      opcode = isArray ? kX86Mov32AR : kX86Mov32MR;
       if (FPREG(rSrc)) {
-        opcode = kX86MovssMR;
+        opcode = isArray ? kX86MovssAR : kX86MovssMR;
         DCHECK(SINGLEREG(rSrc));
       }
       DCHECK_EQ((displacement & 0x3), 0);
       break;
     case kUnsignedHalf:
     case kSignedHalf:
-      opcode = kX86Mov16MR;
+      opcode = isArray ? kX86Mov16AR : kX86Mov16MR;
       DCHECK_EQ((displacement & 0x1), 0);
       break;
     case kUnsignedByte:
     case kSignedByte:
-      opcode = kX86Mov8MR;
+      opcode = isArray ? kX86Mov8AR : kX86Mov8MR;
       break;
     default:
-      LOG(FATAL) << "Bad case in storeBaseIndexedBody";
+      LOG(FATAL) << "Bad case in storeBaseIndexedDisp";
   }
 
-  if (!pair) {
-    store = res = newLIR3(cUnit, opcode, rBase, displacement, rSrc);
+  if (!isArray) {
+    if (!pair) {
+      store = newLIR3(cUnit, opcode, rBase, displacement + LOWORD_OFFSET, rSrc);
+    } else {
+      store = newLIR3(cUnit, opcode, rBase, displacement + LOWORD_OFFSET, rSrc);
+      store2 = newLIR3(cUnit, opcode, rBase, displacement + HIWORD_OFFSET, rSrcHi);
+    }
   } else {
-    store = res = newLIR3(cUnit, opcode, rBase, displacement + LOWORD_OFFSET, rSrc);
-    store2 = newLIR3(cUnit, opcode, rBase, displacement + HIWORD_OFFSET, rSrcHi);
-  }
-
-  if (rBase == rSP) {
-    annotateDalvikRegAccess(store, (displacement + LOWORD_OFFSET) >> 2,
-                            false /* isLoad */, is64bit);
-    if (pair) {
-      annotateDalvikRegAccess(store2, (displacement + HIWORD_OFFSET) >> 2,
-                              false /* isLoad */, is64bit);
+    if (!pair) {
+      store = newLIR5(cUnit, opcode, rBase, rIndex, scale, displacement + LOWORD_OFFSET, rSrc);
+    } else {
+      store = newLIR5(cUnit, opcode, rBase, rIndex, scale, displacement + LOWORD_OFFSET, rSrc);
+      store2 = newLIR5(cUnit, opcode, rBase, rIndex, scale, displacement + HIWORD_OFFSET, rSrcHi);
     }
   }
-  return res;
+
+  return store;
 }
 
-LIR *storeBaseDisp(CompilationUnit *cUnit, int rBase,
-                       int displacement, int rSrc, OpSize size)
-{
-    return storeBaseDispBody(cUnit, rBase, displacement, rSrc, -1, size);
+LIR *storeBaseDisp(CompilationUnit *cUnit, int rBase, int displacement, int rSrc, OpSize size) {
+    return storeBaseIndexedDisp(cUnit, NULL, rBase, INVALID_REG, 0, displacement,
+                                rSrc, INVALID_REG, size, INVALID_SREG);
 }
 
-LIR *storeBaseDispWide(CompilationUnit *cUnit, int rBase,
-                           int displacement, int rSrcLo, int rSrcHi)
-{
-    return storeBaseDispBody(cUnit, rBase, displacement, rSrcLo, rSrcHi, kLong);
+LIR *storeBaseDispWide(CompilationUnit *cUnit, int rBase, int displacement,
+                       int rSrcLo, int rSrcHi) {
+  return storeBaseIndexedDisp(cUnit, NULL, rBase, INVALID_REG, 0, displacement,
+                              rSrcLo, rSrcHi, kLong, INVALID_SREG);
 }
 
 void storePair(CompilationUnit *cUnit, int base, int lowReg, int highReg)
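
The new FP branch of loadConstantValueWide assembles a 64-bit constant directly into xmm registers using the freshly added Psllq and Orps encodings. A worked example, roughly the sequence emitted for the double 1.0 (valLo == 0, valHi == 0x3FF00000):

    // xorps  xmm0, xmm0        // valLo == 0: zero the destination
    // mov    eax, 0x3FF00000   // valHi via a GPR temp (loadConstantNoClobber)
    // movd   xmm1, eax
    // psllq  xmm1, 32          // move the high word into bits 63..32
    // orps   xmm0, xmm1        // combine: xmm0 now holds the double 1.0
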
diff --git a/src/compiler/codegen/x86/X86/Gen.cc b/src/compiler/codegen/x86/X86/Gen.cc
index 5542317..6f33b56 100644
--- a/src/compiler/codegen/x86/X86/Gen.cc
+++ b/src/compiler/codegen/x86/X86/Gen.cc
@@ -406,7 +406,7 @@
 {
   newLIR2(cUnit, kX86Cmp32RR, src1, src2);
   X86ConditionCode cc = oatX86ConditionEncoding(cond);
-  LIR* branch = newLIR2(cUnit, kX86Jcc, 0 /* lir operand for Jcc offset */ , cc);
+  LIR* branch = newLIR2(cUnit, kX86Jcc8, 0 /* lir operand for Jcc offset */ , cc);
   branch->target = target;
   return branch;
 }
@@ -417,7 +417,7 @@
   // TODO: when checkValue == 0 and reg is rCX, use the jcxz/nz opcode
   newLIR2(cUnit, kX86Cmp32RI, reg, checkValue);
   X86ConditionCode cc = oatX86ConditionEncoding(cond);
-  LIR* branch = newLIR2(cUnit, kX86Jcc, 0 /* lir operand for Jcc offset */ , cc);
+  LIR* branch = newLIR2(cUnit, kX86Jcc8, 0 /* lir operand for Jcc offset */ , cc);
   branch->target = target;
   return branch;
 }
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index a767ff8..9b9fc6b 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -32,14 +32,16 @@
  * caller save places a burden on up-calls to save/restore the callee save register, however, there
  * are few registers that are callee save in the ABI. Changing something that is caller save and
  * making it callee save places a burden on down-calls to save/restore the callee save register.
- * For these reasons we aim to match native conventions for caller and callee save
+ * For these reasons we aim to match native conventions for caller and callee save. The first 4
+ * registers can be used for byte operations, for this reason they are preferred for temporary
+ * scratch registers.
  *
  * General Purpose Register:
  *  Native: x86         | x86-64 / x32      | ART
  *  r0/eax: caller save | caller save       | caller, Method*, scratch, return value
- *  r1/ecx: caller save | caller save, arg4 | caller, arg2, scratch
- *  r2/edx: caller save | caller save, arg3 | caller, arg1, scratch, high half of long return
- *  r3/ebx: callee save | callee save       | callee, available for dalvik register promotion
+ *  r1/ecx: caller save | caller save, arg4 | caller, arg1, scratch
+ *  r2/edx: caller save | caller save, arg3 | caller, arg2, scratch, high half of long return
+ *  r3/ebx: callEE save | callEE save       | callER, arg3, scratch
  *  r4/esp: stack pointer
  *  r5/ebp: callee save | callee save       | callee, available for dalvik register promotion
  *  r6/esi: callEE save | callER save, arg2 | callee, available for dalvik register promotion
@@ -228,8 +230,9 @@
  */
 
 #define rARG0 rAX
-#define rARG1 rDX
-#define rARG2 rCX
+#define rARG1 rCX
+#define rARG2 rDX
+#define rARG3 rBX
 #define rRET0 rAX
 #define rRET1 rDX
 #define rINVOKE_TGT rAX
@@ -417,6 +420,8 @@
     Binary0fOpCode(kX86Ucomiss),    // unordered float compare
     Binary0fOpCode(kX86Comisd),     // double compare
     Binary0fOpCode(kX86Comiss),     // float compare
+    Binary0fOpCode(kX86Orps),       // or of floating point registers
+    Binary0fOpCode(kX86Xorps),      // xor of floating point registers
     Binary0fOpCode(kX86Addsd),      // double add
     Binary0fOpCode(kX86Addss),      // float add
     Binary0fOpCode(kX86Mulsd),      // double multiply
@@ -425,8 +430,9 @@
     Binary0fOpCode(kX86Cvtsd2ss),   // double to float
     Binary0fOpCode(kX86Subsd),      // double subtract
     Binary0fOpCode(kX86Subss),      // float subtract
-    Binary0fOpCode(kX86Divsd),      // double subtract
-    Binary0fOpCode(kX86Divss),      // float subtract
+    Binary0fOpCode(kX86Divsd),      // double divide
+    Binary0fOpCode(kX86Divss),      // float divide
+    kX86PsllqRI,                    // quadword left shift of xmm register
     Binary0fOpCode(kX86Movdxr),     // move into xmm from gpr
     Binary0fOpCode(kX86Movdrx),     // move into reg from xmm
     kX86Set8R, kX86Set8M, kX86Set8A,// set byte depending on condition operand
@@ -437,8 +443,8 @@
     Binary0fOpCode(kX86Movsx8),     // sign-extend 8-bit value
     Binary0fOpCode(kX86Movsx16),    // sign-extend 16-bit value
 #undef Binary0fOpCode
-    kX86Jcc,    // jCC rel; lir operands - 0: rel, 1: CC, target assigned
-    kX86Jmp,    // jmp rel; lir operands - 0: rel, target assigned
+    kX86Jcc8, kX86Jcc32,  // jCC rel8/32; lir operands - 0: rel, 1: CC, target assigned
+    kX86Jmp8, kX86Jmp32,  // jmp rel8/32; lir operands - 0: rel, target assigned
     kX86CallR,  // call reg; lir operands - 0: reg
     kX86CallM,  // call [base + disp]; lir operands - 0: base, 1: disp
     kX86CallA,  // call [base + index * scale + disp]
diff --git a/src/compiler/codegen/x86/X86RallocUtil.cc b/src/compiler/codegen/x86/X86RallocUtil.cc
index 156a2d5..ba5c063 100644
--- a/src/compiler/codegen/x86/X86RallocUtil.cc
+++ b/src/compiler/codegen/x86/X86RallocUtil.cc
@@ -96,7 +96,6 @@
 /* Clobber all regs that might be used by an external C call */
 extern void oatClobberCalleeSave(CompilationUnit *cUnit)
 {
-    oatClobber(cUnit, rBX);
     oatClobber(cUnit, rBP);
     oatClobber(cUnit, rSI);
     oatClobber(cUnit, rDI);
diff --git a/src/globals.h b/src/globals.h
index 4300a6e..0cf4260 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -46,6 +46,9 @@
 // Required ARM instruction alignment
 const int kArmAlignment = 4;
 
+// Required X86 instruction alignment
+const int kX86Alignment = 16;
+
 // System page size.  Normally you're expected to get this from
 // sysconf(_SC_PAGESIZE) or some system-specific define (usually
 // PAGESIZE or PAGE_SIZE).  If we use a simple compile-time constant
diff --git a/src/jni_internal_x86.cc b/src/jni_internal_x86.cc
index f9871c4..86d7749 100644
--- a/src/jni_internal_x86.cc
+++ b/src/jni_internal_x86.cc
@@ -42,26 +42,27 @@
   UniquePtr<X86Assembler> assembler(down_cast<X86Assembler*>(Assembler::Create(kX86)));
 #define __ assembler->
   size_t num_arg_array_bytes = NumArgArrayBytes(shorty, shorty_len);
-  // Size of frame = return address + Method* + possible receiver + arg array size
+  // Size of frame = return address + saved EBX + Method* + possible receiver + arg array size
   // Note, space is left in the frame to flush arguments in registers back to out locations.
-  size_t frame_size = 2 * kPointerSize + (is_static ? 0 : kPointerSize) + num_arg_array_bytes;
+  size_t frame_size = 3 * kPointerSize + (is_static ? 0 : kPointerSize) + num_arg_array_bytes;
   size_t pad_size = RoundUp(frame_size, kStackAlignment) - frame_size;
 
   Register rMethod = EAX;
   __ movl(rMethod,   Address(ESP, 4));     // EAX = method
-  Register rReceiver = EDX;
+  Register rReceiver = ECX;
   if (!is_static) {
-    __ movl(rReceiver, Address(ESP, 8));   // EDX = receiver
+    __ movl(rReceiver, Address(ESP, 8));   // ECX = receiver
   }
-  Register rArgArray = ECX;
-  __ movl(rArgArray, Address(ESP, 16));    // ECX = arg array
+  // Save EBX
+  __ pushl(EBX);
+  Register rArgArray = EBX;
+  __ movl(rArgArray, Address(ESP, 20));    // EBX = arg array
 
   // TODO: optimize the frame set up to avoid excessive SP math
   // Push padding
   if (pad_size != 0) {
     __ subl(ESP, Immediate(pad_size));
   }
-
   // Push/copy arguments.
   size_t arg_count = (shorty_len - 1);
   size_t dst_offset = num_arg_array_bytes;
@@ -87,33 +88,48 @@
     }
   }
 
-  // Backing space for receiver
+  // Backing space for receiver.
   if (!is_static) {
     __ pushl(Immediate(0));
   }
-  // Push 0 as NULL Method* thereby terminating managed stack crawls
+  // Push 0 as NULL Method* thereby terminating managed stack crawls.
   __ pushl(Immediate(0));
   if (!is_static) {
-    if (num_arg_array_bytes >= static_cast<size_t>(kPointerSize)) {
-      // Receiver already in EDX, pass 1st arg in ECX.
-      __ movl(ECX, Address(rArgArray, 0));
+    if (shorty_len > 1) {
+      // Receiver already in ECX, pass remaining 2 args in EDX and EBX.
+      __ movl(EDX, Address(rArgArray, 0));
+      if (shorty[1] == 'D' || shorty[1] == 'J') {
+        __ movl(EBX, Address(rArgArray, sizeof(JValue) / 2));
+      } else if (shorty_len > 2) {
+        __ movl(EBX, Address(rArgArray, sizeof(JValue)));
+      }
     }
   } else {
-    if (num_arg_array_bytes >= static_cast<size_t>(kPointerSize)) {
-      // Pass 1st arg in EDX.
-      __ movl(EDX, Address(rArgArray, 0));
-      if (num_arg_array_bytes >= static_cast<size_t>(2* kPointerSize)) {
-        // Pass 2nd arg (or second 32-bit chunk of a wide 1st arg) in ECX.
-        bool is_wide = (shorty[1] == 'D' || shorty[1] == 'J');
-        __ movl(ECX, Address(rArgArray, is_wide ? kPointerSize : 2 * kPointerSize));
+    if (shorty_len > 1) {
+      // Pass remaining 3 args in ECX, EDX and EBX.
+      __ movl(ECX, Address(rArgArray, 0));
+      if (shorty[1] == 'D' || shorty[1] == 'J') {
+        __ movl(EDX, Address(rArgArray, sizeof(JValue) / 2));
+        if (shorty_len > 2) {
+          __ movl(EBX, Address(rArgArray, sizeof(JValue)));
+        }
+      } else if (shorty_len > 2) {
+        __ movl(EDX, Address(rArgArray, sizeof(JValue)));
+        if (shorty[2] == 'D' || shorty[2] == 'J') {
+          __ movl(EBX, Address(rArgArray, sizeof(JValue) + (sizeof(JValue) / 2)));
+        } else {
+          __ movl(EBX, Address(rArgArray, sizeof(JValue) + sizeof(JValue)));
+        }
       }
     }
   }
 
   __ call(Address(EAX, Method::GetCodeOffset()));  // Call code off of method
 
-  // pop arguments up to the return address
-  __ addl(ESP, Immediate(frame_size + pad_size - kPointerSize));
+  // Pop arguments up to EBX and the return address.
+  __ addl(ESP, Immediate(frame_size + pad_size - (2 * kPointerSize)));
+  // Restore EBX.
+  __ popl(EBX);
   char ch = shorty[0];
   if (ch != 'V') {
     // Load the result JValue pointer.
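
A worked example of the new register marshalling, sketched for a virtual method with shorty "VJ" (void return, one long parameter):

    //   ECX = receiver                      // loaded from ESP + 8 on entry
    //   EDX = arg_array[0]                  // low word of the long
    //   EBX = arg_array[sizeof(JValue)/2]   // high word, at offset 4
    // Any further argument words travel only in the stack copy pushed earlier.
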
diff --git a/src/runtime.cc b/src/runtime.cc
index 96c4451..b340317 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -968,8 +968,7 @@
     method->SetFpSpillMask(fp_spills);
   } else if (instruction_set == kX86) {
     method->SetFrameSizeInBytes(32);
-    method->SetCoreSpillMask((1 << art::x86::EBX) | (1 << art::x86::EBP) | (1 << art::x86::ESI) |
-                             (1 << art::x86::EDI));
+    method->SetCoreSpillMask((1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI));
     method->SetFpSpillMask(0);
   } else {
     UNIMPLEMENTED(FATAL);
diff --git a/src/runtime_support_x86.S b/src/runtime_support_x86.S
index e621eff..3d57d5d 100644
--- a/src/runtime_support_x86.S
+++ b/src/runtime_support_x86.S
@@ -18,7 +18,7 @@
     pushl %edi  // Save callee saves
     pushl %esi
     pushl %ebp
-    pushl %ebx
+    pushl $0
     pushl $0
     pushl $0
     pushl $0   // Will be clobbered to be Method*
diff --git a/src/stub_x86.cc b/src/stub_x86.cc
index 1820f5f..845c179 100644
--- a/src/stub_x86.cc
+++ b/src/stub_x86.cc
@@ -49,7 +49,7 @@
   __ pushl(EDI);
   __ pushl(ESI);
   __ pushl(EBP);
-  __ pushl(EBX);
+  __ pushl(Immediate(0));
   __ pushl(Immediate(0));
   __ pushl(Immediate(0));
   __ pushl(Immediate(0));  // <-- callee save Method* to go here
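
With EBX now caller-save, both the assembly and stub versions of the callee-save frame push a zero placeholder where EBX used to be spilled, keeping the layout and the 32-byte frame size (set in runtime.cc above) unchanged:

    // Pushed in order: EDI, ESI, EBP, 0 (was EBX), 0, 0, 0 (Method* slot);
    // 7 words plus the return address = 32 bytes.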