Merge "test-art-host-oat targets should not rerun dex2oat" into ics-mr1-plus-art
diff --git a/build/Android.common.mk b/build/Android.common.mk
index 41edefa..28ac65b 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -104,7 +104,6 @@
 LIBART_COMMON_SRC_FILES := \
 	src/atomic.cc.arm \
 	src/card_table.cc \
-	src/constants.cc \
 	src/check_jni.cc \
 	src/class_linker.cc \
 	src/class_loader.cc \
@@ -268,6 +267,12 @@
 $(error unsupported HOST_ARCH=$(HOST_ARCH))
 endif # HOST_ARCH != x86
 
+LIBART_ENUM_OPERATOR_OUT_HEADER_FILES := \
+	src/indirect_reference_table.h \
+	src/instruction_set.h \
+	src/invoke_type.h \
+	src/mutex.h
+
 LIBARTTEST_COMMON_SRC_FILES := \
 	test/StackWalk/stack_walk_jni.cc \
 	test/ReferenceMap/stack_walk_refmap_jni.cc
diff --git a/build/Android.libart.mk b/build/Android.libart.mk
index 5406c9e..c7a85ef 100644
--- a/build/Android.libart.mk
+++ b/build/Android.libart.mk
@@ -54,8 +54,20 @@
     LOCAL_SRC_FILES := $(LIBART_TARGET_SRC_FILES)
   else # host
     LOCAL_SRC_FILES := $(LIBART_HOST_SRC_FILES)
+    LOCAL_IS_HOST_MODULE := true
   endif
 
+  GENERATED_SRC_DIR := $$(call intermediates-dir-for,$$(LOCAL_MODULE_CLASS),$$(LOCAL_MODULE),$$(LOCAL_IS_HOST_MODULE),)
+  ENUM_OPERATOR_OUT_CC_FILES := $$(patsubst %.h,%_operator_out.cc,$$(LIBART_ENUM_OPERATOR_OUT_HEADER_FILES))
+  ENUM_OPERATOR_OUT_GEN := $$(addprefix $$(GENERATED_SRC_DIR)/,$$(ENUM_OPERATOR_OUT_CC_FILES))
+
+$$(ENUM_OPERATOR_OUT_GEN): art/tools/generate-operator-out.py
+$$(ENUM_OPERATOR_OUT_GEN): PRIVATE_CUSTOM_TOOL = art/tools/generate-operator-out.py $$< > $$@
+$$(ENUM_OPERATOR_OUT_GEN): $$(GENERATED_SRC_DIR)/%_operator_out.cc : art/%.h
+	$$(transform-generated-source)
+
+  LOCAL_GENERATED_SOURCES += $$(ENUM_OPERATOR_OUT_GEN)
+
   LOCAL_CFLAGS := $(LIBART_CFLAGS)
   ifeq ($$(art_target_or_host),target)
     LOCAL_CFLAGS += $(ART_TARGET_CFLAGS)
@@ -102,7 +114,6 @@
     endif
     include $(BUILD_SHARED_LIBRARY)
   else # host
-    LOCAL_IS_HOST_MODULE := true
     ifeq ($(ART_USE_LLVM_COMPILER),true)
       include $(LLVM_GEN_INTRINSICS_MK)
       include $(LLVM_HOST_BUILD_MK)
diff --git a/src/compiler/codegen/CodegenUtil.cc b/src/compiler/codegen/CodegenUtil.cc
index 20eb47f..00e78ec 100644
--- a/src/compiler/codegen/CodegenUtil.cc
+++ b/src/compiler/codegen/CodegenUtil.cc
@@ -657,6 +657,8 @@
          */
 #if defined(TARGET_ARM)
         int bxOffset = tabRec->anchor->offset + 4;
+#elif defined(TARGET_X86)
+        int bxOffset = 0;
 #else
         int bxOffset = tabRec->anchor->offset;
 #endif
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index 3cc594c..aeacab8 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -43,6 +43,20 @@
 #endif
 }
 
+void callRuntimeHelperReg(CompilationUnit* cUnit, int helperOffset, int arg0) {
+#if !defined(TARGET_X86)
+    int rTgt = loadHelper(cUnit, helperOffset);
+#endif
+    opRegCopy(cUnit, rARG0, arg0);
+    oatClobberCalleeSave(cUnit);
+#if !defined(TARGET_X86)
+    opReg(cUnit, kOpBlx, rTgt);
+    oatFreeTemp(cUnit, rTgt);
+#else
+    opThreadMem(cUnit, kOpBlx, helperOffset);
+#endif
+}
+
 void callRuntimeHelperRegLocation(CompilationUnit* cUnit, int helperOffset,
                                   RegLocation arg0) {
 #if !defined(TARGET_X86)
@@ -431,7 +445,7 @@
             cond = (ConditionCode)0;
             LOG(FATAL) << "Unexpected opcode " << (int)opcode;
     }
-#if defined(TARGET_MIPS)
+#if defined(TARGET_MIPS) || defined(TARGET_X86)
     opCmpImmBranch(cUnit, cond, rlSrc.lowReg, 0, &labelList[bb->taken->id]);
 #else
     opRegImm(cUnit, kOpCmp, rlSrc.lowReg, 0);
@@ -1811,37 +1825,40 @@
                 (int)mir->dalvikInsn.opcode;
     }
     if (!callOut) {
-        rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
         if (unary) {
+            rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
             rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
             opRegReg(cUnit, op, rlResult.lowReg,
                      rlSrc1.lowReg);
         } else {
-            rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
-#if defined(TARGET_X86)
-            rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-            opRegRegReg(cUnit, op, rlResult.lowReg,
-                        rlSrc1.lowReg, rlSrc2.lowReg);
-#else
             if (shiftOp) {
+#if !defined(TARGET_X86)
+                rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
                 int tReg = oatAllocTemp(cUnit);
                 opRegRegImm(cUnit, kOpAnd, tReg, rlSrc2.lowReg, 31);
+#else
+                // X86 doesn't require masking and must use ECX
+                loadValueDirectFixed(cUnit, rlSrc2, rCX);
+                int tReg = rCX;
+#endif
+                rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
                 rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
                 opRegRegReg(cUnit, op, rlResult.lowReg,
                             rlSrc1.lowReg, tReg);
                 oatFreeTemp(cUnit, tReg);
             } else {
+                rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
+                rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
                 rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
                 opRegRegReg(cUnit, op, rlResult.lowReg,
                             rlSrc1.lowReg, rlSrc2.lowReg);
             }
-#endif
         }
         storeValue(cUnit, rlDest, rlResult);
     } else {
         RegLocation rlResult;
         oatFlushAllRegs(cUnit);   /* Send everything to home location */
-        loadValueDirectFixed(cUnit, rlSrc2, rRET1);
+        loadValueDirectFixed(cUnit, rlSrc2, rARG1);
 #if !defined(TARGET_X86)
         int rTgt = loadHelper(cUnit, funcOffset);
 #endif
@@ -2151,12 +2168,8 @@
             break;
         case Instruction::ADD_LONG:
         case Instruction::ADD_LONG_2ADDR:
-#if defined(TARGET_MIPS)
+#if defined(TARGET_MIPS) || defined(TARGET_X86)
             return genAddLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
-#elif defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLadd);
 #else
             firstOp = kOpAdd;
             secondOp = kOpAdc;
@@ -2164,16 +2177,13 @@
 #endif
         case Instruction::SUB_LONG:
         case Instruction::SUB_LONG_2ADDR:
-#if defined(TARGET_MIPS)
+#if defined(TARGET_MIPS) || defined(TARGET_X86)
             return genSubLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
-#elif defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLsub);
-#endif
+#else
             firstOp = kOpSub;
             secondOp = kOpSbc;
             break;
+#endif
         case Instruction::MUL_LONG:
         case Instruction::MUL_LONG_2ADDR:
             callOut = true;
@@ -2187,45 +2197,45 @@
             retReg = rRET0;
             funcOffset = ENTRYPOINT_OFFSET(pLdivmod);
             break;
-        /* NOTE - result is in rARG2/rARG3 instead of rRET0/rRET1 */
-        // FIXME: is true, or could be made true, or other targets?
         case Instruction::REM_LONG:
         case Instruction::REM_LONG_2ADDR:
             callOut = true;
             checkZero = true;
-            funcOffset = ENTRYPOINT_OFFSET(pLdivmod);
+            funcOffset = ENTRYPOINT_OFFSET(pLdiv);
+#if defined(TARGET_ARM)
+            /* NOTE - result is in rARG2/rARG3 instead of rRET0/rRET1 */
             retReg = rARG2;
+#else
+            retReg = rRET0;
+#endif
             break;
         case Instruction::AND_LONG_2ADDR:
         case Instruction::AND_LONG:
 #if defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLand);
-#endif
+            return genAndLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
+#else
             firstOp = kOpAnd;
             secondOp = kOpAnd;
             break;
+#endif
         case Instruction::OR_LONG:
         case Instruction::OR_LONG_2ADDR:
 #if defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLor);
-#endif
+            return genOrLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
+#else
             firstOp = kOpOr;
             secondOp = kOpOr;
             break;
+#endif
         case Instruction::XOR_LONG:
         case Instruction::XOR_LONG_2ADDR:
 #if defined(TARGET_X86)
-            callOut = true;
-            retReg = rRET0;
-            funcOffset = ENTRYPOINT_OFFSET(pLxor);
-#endif
+            return genXorLong(cUnit, mir, rlDest, rlSrc1, rlSrc2);
+#else
             firstOp = kOpXor;
             secondOp = kOpXor;
             break;
+#endif
         case Instruction::NEG_LONG: {
             return genNegLong(cUnit, mir, rlDest, rlSrc2);
         }
diff --git a/src/compiler/codegen/GenInvoke.cc b/src/compiler/codegen/GenInvoke.cc
index 8a9d1f5..a904419 100644
--- a/src/compiler/codegen/GenInvoke.cc
+++ b/src/compiler/codegen/GenInvoke.cc
@@ -286,15 +286,15 @@
      * This handles the case in which the base method is not fully
      * resolved at compile time, we bail to a runtime helper.
      */
-#if !defined(TARGET_X86)
     if (state == 0) {
+#if !defined(TARGET_X86)
         // Load trampoline target
         loadWordDisp(cUnit, rSELF, trampoline, rINVOKE_TGT);
+#endif
         // Load rARG0 with method index
         loadConstant(cUnit, rARG0, dexIdx);
         return 1;
     }
-#endif
     return -1;
 }
 
@@ -357,11 +357,7 @@
                 uint32_t methodIdx, uintptr_t directCode,
                 uintptr_t directMethod, InvokeType type, bool skipThis)
 {
-#if !defined(TARGET_X86)
     int lastArgReg = rARG3;
-#else
-    int lastArgReg = rARG2;
-#endif
     int nextReg = rARG1;
     int nextArg = 0;
     if (skipThis) {
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index 5ffe3e4..0b8a19d 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -128,11 +128,31 @@
 #if !defined(TARGET_X86)
     opReg(cUnit, kOpBlx, rINVOKE_TGT);
 #else
-    if (fastPath) {
+    if (fastPath && type != kInterface) {
       opMem(cUnit, kOpBlx, rARG0, Method::GetCodeOffset().Int32Value());
     } else {
-      UNIMPLEMENTED(FATAL) << "compute trampoline";
-      opThreadMem(cUnit, kOpBlx, 0);
+      int trampoline = 0;
+      switch (type) {
+        case kInterface:
+          trampoline = fastPath ? ENTRYPOINT_OFFSET(pInvokeInterfaceTrampoline)
+                                : ENTRYPOINT_OFFSET(pInvokeInterfaceTrampolineWithAccessCheck);
+          break;
+        case kDirect:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeDirectTrampolineWithAccessCheck);
+          break;
+        case kStatic:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeStaticTrampolineWithAccessCheck);
+          break;
+        case kSuper:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeSuperTrampolineWithAccessCheck);
+          break;
+        case kVirtual:
+          trampoline = ENTRYPOINT_OFFSET(pInvokeVirtualTrampolineWithAccessCheck);
+          break;
+        default:
+          LOG(FATAL) << "Unexpected invoke type";
+      }
+      opThreadMem(cUnit, kOpBlx, trampoline);
     }
 #endif
 
@@ -375,7 +395,7 @@
             break;
 
         case Instruction::SPARSE_SWITCH:
-            genSparseSwitch(cUnit, mir, rlSrc[0]);
+            genSparseSwitch(cUnit, mir, rlSrc[0], labelList);
             break;
 
         case Instruction::CMPL_FLOAT:
diff --git a/src/compiler/codegen/arm/Thumb2/Gen.cc b/src/compiler/codegen/arm/Thumb2/Gen.cc
index f485403..44cae0f 100644
--- a/src/compiler/codegen/arm/Thumb2/Gen.cc
+++ b/src/compiler/codegen/arm/Thumb2/Gen.cc
@@ -369,7 +369,8 @@
  *   add   rPC, rDisp   ; This is the branch from which we compute displacement
  *   cbnz  rIdx, lp
  */
-void genSparseSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
+void genSparseSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc,
+                     LIR* labelList)
 {
     const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
     if (cUnit->printMe) {
diff --git a/src/compiler/codegen/mips/Mips32/Gen.cc b/src/compiler/codegen/mips/Mips32/Gen.cc
index b810f98..ade2fd8 100644
--- a/src/compiler/codegen/mips/Mips32/Gen.cc
+++ b/src/compiler/codegen/mips/Mips32/Gen.cc
@@ -63,7 +63,8 @@
  * done:
  *
  */
-void genSparseSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
+void genSparseSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc,
+                     LIR* labelList)
 {
     const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
     if (cUnit->printMe) {
diff --git a/src/compiler/codegen/x86/ArchFactory.cc b/src/compiler/codegen/x86/ArchFactory.cc
index bd95afb..043d66e 100644
--- a/src/compiler/codegen/x86/ArchFactory.cc
+++ b/src/compiler/codegen/x86/ArchFactory.cc
@@ -24,30 +24,94 @@
 
 namespace art {
 
+bool genAddLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);
+  // Compute (r1:r0) = (r1:r0) + (r2:r3)
+  opRegReg(cUnit, kOpAdd, r0, r2);  // r0 = r0 + r2
+  opRegReg(cUnit, kOpAdc, r1, r3);  // r1 = r1 + r3 + CF
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+bool genSubLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);
+  // Compute (r1:r0) = (r1:r0) - (r3:r2)
+  opRegReg(cUnit, kOpSub, r0, r2);  // r0 = r0 - r2
+  opRegReg(cUnit, kOpSbc, r1, r3);  // r1 = r1 - r3 - CF
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+bool genAndLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);
+  // Compute (r1:r0) = (r1:r0) & (r3:r2)
+  opRegReg(cUnit, kOpAnd, r0, r2);  // r0 = r0 & r2
+  opRegReg(cUnit, kOpAnd, r1, r3);  // r1 = r1 & r3
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+bool genOrLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);
+  // Compute (r1:r0) = (r1:r0) | (r3:r2)
+  opRegReg(cUnit, kOpOr, r0, r2);  // r0 = r0 | r2
+  opRegReg(cUnit, kOpOr, r1, r3);  // r1 = r1 | r3
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
+bool genXorLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2)
+{
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+  loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);
+  // Compute (r1:r0) = (r1:r0) ^ (r3:r2)
+  opRegReg(cUnit, kOpXor, r0, r2);  // r0 = r0 ^ r2
+  opRegReg(cUnit, kOpXor, r1, r3);  // r1 = r1 ^ r3
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
+}
+
 bool genNegLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                 RegLocation rlSrc)
 {
-    UNIMPLEMENTED(WARNING) << "genNegLong";
-#if 0
-    rlSrc = loadValueWide(cUnit, rlSrc, kCoreReg);
-    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-    /*
-     *  [v1 v0] =  -[a1 a0]
-     *    negu    v0,a0
-     *    negu    v1,a1
-     *    sltu    t1,r_zero
-     *    subu    v1,v1,t1
-     */
-
-    opRegReg(cUnit, kOpNeg, rlResult.lowReg, rlSrc.lowReg);
-    opRegReg(cUnit, kOpNeg, rlResult.highReg, rlSrc.highReg);
-    int tReg = oatAllocTemp(cUnit);
-    newLIR3(cUnit, kX86Sltu, tReg, r_ZERO, rlResult.lowReg);
-    opRegRegReg(cUnit, kOpSub, rlResult.highReg, rlResult.highReg, tReg);
-    oatFreeTemp(cUnit, tReg);
-    storeValueWide(cUnit, rlDest, rlResult);
-#endif
-    return false;
+  oatFlushAllRegs(cUnit);
+  oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+  loadValueDirectWideFixed(cUnit, rlSrc, r0, r1);
+  // Compute (r1:r0) = -(r1:r0)
+  opRegReg(cUnit, kOpNeg, r0, r0);  // r0 = -r0
+  opRegImm(cUnit, kOpAdc, r1, 0);   // r1 = r1 + CF
+  opRegReg(cUnit, kOpNeg, r1, r1);  // r1 = -r1
+  RegLocation rlResult = {kLocPhysReg, 1, 0, 0,  0, 0, 1, r0, r1, INVALID_SREG};
+  storeValueWide(cUnit, rlDest, rlResult);
+  return false;
 }
 
 void genDebuggerUpdate(CompilationUnit* cUnit, int32_t offset);
diff --git a/src/compiler/codegen/x86/ArchUtility.cc b/src/compiler/codegen/x86/ArchUtility.cc
index 6c54e34..d325f5c 100644
--- a/src/compiler/codegen/x86/ArchUtility.cc
+++ b/src/compiler/codegen/x86/ArchUtility.cc
@@ -49,7 +49,7 @@
 
 /*
  * Interpret a format string and build a string no longer than size
- * See format key in Assemble.c.
+ * See format key in Assemble.cc.
  */
 std::string buildInsnString(const char *fmt, LIR *lir, unsigned char* baseAddr) {
   std::string buf;
@@ -79,6 +79,11 @@
           case 'd':
             buf += StringPrintf("%d", operand);
             break;
+          case 'p': {
+            SwitchTable *tabRec = reinterpret_cast<SwitchTable*>(operand);
+            buf += StringPrintf("0x%08x", tabRec->offset);
+            break;
+          }
           case 'r':
             if (FPREG(operand) || DOUBLEREG(operand)) {
               int fp_reg = operand & FP_REG_MASK;
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index d1a8d64..2639057 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -26,7 +26,7 @@
 
 X86EncodingMap EncodingMap[kX86Last] = {
   { kX8632BitData, kData,    IS_UNARY_OP,            { 0, 0, 0x00, 0, 0, 0, 0, 4 }, "data",  "0x!0d" },
-  { kX86Bkpt,      kNullary, NO_OPERAND | IS_BRANCH, { 0, 0, 0xCC, 0, 0, 0, 0, 4 }, "int 3", "" },
+  { kX86Bkpt,      kNullary, NO_OPERAND | IS_BRANCH, { 0, 0, 0xCC, 0, 0, 0, 0, 0 }, "int 3", "" },
   { kX86Nop,       kNop,     IS_UNARY_OP,            { 0, 0, 0x90, 0, 0, 0, 0, 0 }, "nop",   "" },
 
 #define ENCODING_MAP(opname, is_store, \
@@ -197,17 +197,16 @@
 { kX86 ## opname ## 32RI, kShiftRegImm,   IS_BINARY_OP   | SETS_CCODES, { 0, 0,    0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32RI", "!0r,!1d" }, \
 { kX86 ## opname ## 32MI, kShiftMemImm,   IS_TERTIARY_OP | SETS_CCODES, { 0, 0,    0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32MI", "[!0r+!1d],!2r" }, \
 { kX86 ## opname ## 32AI, kShiftArrayImm, IS_QUIN_OP     | SETS_CCODES, { 0, 0,    0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
-{ kX86 ## opname ## 32RC, kShiftRegCl,    IS_BINARY_OP   | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    1 }, #opname "32RC", "" }, \
-{ kX86 ## opname ## 32MC, kShiftMemCl,    IS_TERTIARY_OP | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    1 }, #opname "32MC", "" }, \
-{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_QUIN_OP     | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    1 }, #opname "32AC", "" }
+{ kX86 ## opname ## 32RC, kShiftRegCl,    IS_BINARY_OP   | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32RC", "" }, \
+{ kX86 ## opname ## 32MC, kShiftMemCl,    IS_TERTIARY_OP | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32MC", "" }, \
+{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_QUIN_OP     | SETS_CCODES, { 0, 0,    0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32AC", "" }
 
   SHIFT_ENCODING_MAP(Rol, 0x0),
   SHIFT_ENCODING_MAP(Ror, 0x1),
   SHIFT_ENCODING_MAP(Rcl, 0x2),
   SHIFT_ENCODING_MAP(Rcr, 0x3),
   SHIFT_ENCODING_MAP(Sal, 0x4),
-  SHIFT_ENCODING_MAP(Shl, 0x5),
-  SHIFT_ENCODING_MAP(Shr, 0x6),
+  SHIFT_ENCODING_MAP(Shr, 0x5),
   SHIFT_ENCODING_MAP(Sar, 0x7),
 #undef SHIFT_ENCODING_MAP
 
@@ -295,11 +294,16 @@
   { kX86Jcc32, kJcc,  IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0x0F, 0x80, 0, 0, 0, 0 }, "Jcc32", "!1c !0t" },
   { kX86Jmp8,  kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0xEB, 0,    0, 0, 0, 0 }, "Jmp8",  "!0t" },
   { kX86Jmp32, kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP, { 0,             0, 0xE9, 0,    0, 0, 0, 0 }, "Jmp32", "!0t" },
-  { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH,               { 0,             0, 0xE8, 0, 0, 0, 0, 0 }, "CallR", "!0r" },
-  { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0, 0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
-  { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0, 0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
-  { kX86CallT, kCall, IS_UNARY_OP  | IS_BRANCH | IS_LOAD,     { THREAD_PREFIX, 0, 0xFF, 0, 0, 2, 0, 0 }, "CallT", "fs:[!0d]" },
-  { kX86Ret,   kNullary,NO_OPERAND | IS_BRANCH,               { 0,             0, 0xC3, 0, 0, 0, 0, 0 }, "Ret", "" },
+  { kX86JmpR,  kJmp,  IS_UNARY_OP  | IS_BRANCH,               { 0,             0, 0xFF, 0,    0, 4, 0, 0 }, "JmpR",  "!0r" },
+  { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH,               { 0,             0, 0xE8, 0,    0, 0, 0, 0 }, "CallR", "!0r" },
+  { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
+  { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD,     { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
+  { kX86CallT, kCall, IS_UNARY_OP  | IS_BRANCH | IS_LOAD,     { THREAD_PREFIX, 0, 0xFF, 0,    0, 2, 0, 0 }, "CallT", "fs:[!0d]" },
+  { kX86Ret,   kNullary,NO_OPERAND | IS_BRANCH,               { 0,             0, 0xC3, 0,    0, 0, 0, 0 }, "Ret", "" },
+
+  { kX86StartOfMethod, kMacro,  IS_UNARY_OP | SETS_CCODES, { 0,0,0,0,0,0,0,0 },           "StartOfMethod", "!0r" },
+  { kX86PcRelLoadRA,   kPcRel,  IS_LOAD | IS_QUIN_OP,      { 0, 0, 0x8B, 0, 0, 0, 0, 0 }, "PcRelLoadRA",   "!0r,[!1r+!2r<<!3d+!4p]" },
+  { kX86PcRelAdr,      kPcRel,  IS_LOAD | IS_BINARY_OP,    { 0, 0, 0xB8, 0, 0, 0, 0, 4 }, "PcRelAdr",      "!0r,!1d" },
 };
 
 static size_t computeSize(X86EncodingMap* entry, int displacement, bool has_sib) {
@@ -323,7 +327,7 @@
   }
   if (displacement != 0) {
     if (entry->opcode != kX86Lea32RA) {
-      DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), 0);
+      DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), 0) << entry->name;
     }
     size += IS_SIMM8(displacement) ? 1 : 4;
   }
@@ -428,9 +432,11 @@
     case kJmp:
       if (lir->opcode == kX86Jmp8) {
         return 2;  // opcode + rel8
-      } else {
-        DCHECK(lir->opcode == kX86Jmp32);
+      } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
+      } else {
+        DCHECK(lir->opcode == kX86JmpR);
+        return 2;  // opcode + modrm
       }
     case kCall:
       switch (lir->opcode) {
@@ -445,6 +451,19 @@
           break;
       }
       break;
+    case kPcRel:
+      if (entry->opcode == kX86PcRelLoadRA) {
+        // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
+        return computeSize(entry, 0x12345678, true);
+      } else {
+        DCHECK(entry->opcode == kX86PcRelAdr);
+        return 5; // opcode with reg + 4 byte immediate
+      }
+    case kMacro:
+      DCHECK_EQ(lir->opcode, static_cast<int>(kX86StartOfMethod));
+      return 5 /* call opcode + 4 byte displacement */ + 1 /* pop reg */ +
+          computeSize(&EncodingMap[kX86Sub32RI], 0, false) -
+          (lir->operands[0] == rAX  ? 1 : 0);  // shorter ax encoding
     default:
       break;
   }
@@ -802,7 +821,7 @@
 }
 
 static void emitShiftRegImm(CompilationUnit* cUnit, const X86EncodingMap* entry,
-                       uint8_t reg, int imm) {
+                            uint8_t reg, int imm) {
   if (entry->skeleton.prefix1 != 0) {
     cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
     if (entry->skeleton.prefix2 != 0) {
@@ -829,7 +848,7 @@
     DCHECK_EQ(0, entry->skeleton.extra_opcode2);
   }
   DCHECK_LT(reg, 8);
-  uint8_t modrm = (0 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
+  uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
   cUnit->codeBuffer.push_back(modrm);
   if (imm != 1) {
     DCHECK_EQ(entry->skeleton.immediate_bytes, 1);
@@ -838,18 +857,67 @@
   }
 }
 
+static void emitShiftRegCl(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                           uint8_t reg, uint8_t cl) {
+  DCHECK_EQ(cl, static_cast<uint8_t>(rCX));
+  if (entry->skeleton.prefix1 != 0) {
+    cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
+    if (entry->skeleton.prefix2 != 0) {
+      cUnit->codeBuffer.push_back(entry->skeleton.prefix2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.prefix2);
+  }
+  cUnit->codeBuffer.push_back(entry->skeleton.opcode);
+  DCHECK_EQ(0, entry->skeleton.extra_opcode1);
+  DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+  DCHECK_LT(reg, 8);
+  uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
+  cUnit->codeBuffer.push_back(modrm);
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+  DCHECK_EQ(0, entry->skeleton.immediate_bytes);
+}
+
+static void emitRegCond(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                       uint8_t reg, uint8_t condition) {
+  if (entry->skeleton.prefix1 != 0) {
+    cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
+    if (entry->skeleton.prefix2 != 0) {
+      cUnit->codeBuffer.push_back(entry->skeleton.prefix2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.prefix2);
+  }
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+  DCHECK_EQ(0x0F, entry->skeleton.opcode);
+  cUnit->codeBuffer.push_back(0x0F);
+  DCHECK_EQ(0x90, entry->skeleton.extra_opcode1);
+  cUnit->codeBuffer.push_back(0x90 | condition);
+  DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+  DCHECK_LT(reg, 8);
+  uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
+  cUnit->codeBuffer.push_back(modrm);
+  DCHECK_EQ(entry->skeleton.immediate_bytes, 0);
+}
+
 static void emitJmp(CompilationUnit* cUnit, const X86EncodingMap* entry, int rel) {
   if (entry->opcode == kX86Jmp8) {
     DCHECK(IS_SIMM8(rel));
     cUnit->codeBuffer.push_back(0xEB);
     cUnit->codeBuffer.push_back(rel & 0xFF);
-  } else {
-    DCHECK(entry->opcode == kX86Jmp32);
+  } else if (entry->opcode == kX86Jmp32) {
     cUnit->codeBuffer.push_back(0xE9);
     cUnit->codeBuffer.push_back(rel & 0xFF);
     cUnit->codeBuffer.push_back((rel >> 8) & 0xFF);
     cUnit->codeBuffer.push_back((rel >> 16) & 0xFF);
     cUnit->codeBuffer.push_back((rel >> 24) & 0xFF);
+  } else {
+    DCHECK(entry->opcode == kX86JmpR);
+    cUnit->codeBuffer.push_back(entry->skeleton.opcode);
+    uint8_t reg = static_cast<uint8_t>(rel);
+    DCHECK_LT(reg, 8);
+    uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
+    cUnit->codeBuffer.push_back(modrm);
   }
 }
 
@@ -932,6 +1000,68 @@
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
 
+static void emitPcRel(CompilationUnit* cUnit, const X86EncodingMap* entry, uint8_t reg,
+                      int base_or_table, uint8_t index, int scale, int table_or_disp) {
+  int disp;
+  if (entry->opcode == kX86PcRelLoadRA) {
+    SwitchTable *tabRec = (SwitchTable*)table_or_disp;
+    disp = tabRec->offset;
+  } else {
+    DCHECK(entry->opcode == kX86PcRelAdr);
+    FillArrayData *tabRec = (FillArrayData *)base_or_table;
+    disp = tabRec->offset;
+  }
+  if (entry->skeleton.prefix1 != 0) {
+    cUnit->codeBuffer.push_back(entry->skeleton.prefix1);
+    if (entry->skeleton.prefix2 != 0) {
+      cUnit->codeBuffer.push_back(entry->skeleton.prefix2);
+    }
+  } else {
+    DCHECK_EQ(0, entry->skeleton.prefix2);
+  }
+  if (FPREG(reg)) {
+    reg = reg & FP_REG_MASK;
+  }
+  DCHECK_LT(reg, 8);
+  if (entry->opcode == kX86PcRelLoadRA) {
+    cUnit->codeBuffer.push_back(entry->skeleton.opcode);
+    DCHECK_EQ(0, entry->skeleton.extra_opcode1);
+    DCHECK_EQ(0, entry->skeleton.extra_opcode2);
+    uint8_t modrm = (2 << 6) | (reg << 3) | rSP;
+    cUnit->codeBuffer.push_back(modrm);
+    DCHECK_LT(scale, 4);
+    DCHECK_LT(index, 8);
+    DCHECK_LT(base_or_table, 8);
+    uint8_t base = static_cast<uint8_t>(base_or_table);
+    uint8_t sib = (scale << 6) | (index << 3) | base;
+    cUnit->codeBuffer.push_back(sib);
+    DCHECK_EQ(0, entry->skeleton.immediate_bytes);
+  } else {
+    cUnit->codeBuffer.push_back(entry->skeleton.opcode + reg);
+  }
+  cUnit->codeBuffer.push_back(disp & 0xFF);
+  cUnit->codeBuffer.push_back((disp >> 8) & 0xFF);
+  cUnit->codeBuffer.push_back((disp >> 16) & 0xFF);
+  cUnit->codeBuffer.push_back((disp >> 24) & 0xFF);
+  DCHECK_EQ(0, entry->skeleton.modrm_opcode);
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+}
+
+static void emitMacro(CompilationUnit* cUnit, const X86EncodingMap* entry,
+                      uint8_t reg, int offset) {
+  DCHECK(entry->opcode == kX86StartOfMethod) << entry->name;
+  cUnit->codeBuffer.push_back(0xE8);  // call +0
+  cUnit->codeBuffer.push_back(0);
+  cUnit->codeBuffer.push_back(0);
+  cUnit->codeBuffer.push_back(0);
+  cUnit->codeBuffer.push_back(0);
+
+  DCHECK_LT(reg, 8);
+  cUnit->codeBuffer.push_back(0x58 + reg);  // pop reg
+
+  emitRegImm(cUnit, &EncodingMap[kX86Sub32RI], reg, offset + 5 /* size of call +0 */);
+}
+
 void emitUnimplemented(CompilationUnit* cUnit, const X86EncodingMap* entry, LIR* lir) {
   UNIMPLEMENTED(WARNING) << "encoding for: " << entry->name;
   for (int i = 0; i < oatGetInsnSize(lir); ++i) {
@@ -949,7 +1079,7 @@
   LIR *lir;
   AssemblerStatus res = kSuccess;  // Assume success
 
-  const bool kVerbosePcFixup = false;
+  const bool kVerbosePcFixup = false;  // leftover per-method debug filter removed
   for (lir = (LIR *) cUnit->firstLIRInsn; lir; lir = NEXT_LIR(lir)) {
     if (lir->opcode < 0) {
       continue;
@@ -982,6 +1112,29 @@
             oatSetupResourceMasks(lir);
             res = kRetryAll;
           }
+          if (kVerbosePcFixup) {
+            LOG(INFO) << "Source:";
+            oatDumpLIRInsn(cUnit, lir, 0);
+            LOG(INFO) << "Target:";
+            oatDumpLIRInsn(cUnit, targetLIR, 0);
+            LOG(INFO) << "Delta " << delta;
+          }
+          lir->operands[0] = delta;
+          break;
+        }
+        case kX86Jcc32: {
+          LIR *targetLIR = lir->target;
+          DCHECK(targetLIR != NULL);
+          intptr_t pc = lir->offset + 6 /* 2 byte opcode + rel32 */;
+          intptr_t target = targetLIR->offset;
+          int delta = target - pc;
+          if (kVerbosePcFixup) {
+            LOG(INFO) << "Source:";
+            oatDumpLIRInsn(cUnit, lir, 0);
+            LOG(INFO) << "Target:";
+            oatDumpLIRInsn(cUnit, targetLIR, 0);
+            LOG(INFO) << "Delta " << delta;
+          }
           lir->operands[0] = delta;
           break;
         }
@@ -1015,6 +1168,15 @@
           lir->operands[0] = delta;
           break;
         }
+        case kX86Jmp32: {
+          LIR *targetLIR = lir->target;
+          DCHECK(targetLIR != NULL);
+          intptr_t pc = lir->offset + 5 /* opcode + rel32 */;
+          intptr_t target = targetLIR->offset;
+          int delta = target - pc;
+          lir->operands[0] = delta;
+          break;
+        }
         default:
           break;
       }
@@ -1028,6 +1190,7 @@
     if (res != kSuccess) {
       continue;
     }
+    CHECK_EQ(static_cast<size_t>(lir->offset), cUnit->codeBuffer.size());
     const X86EncodingMap *entry = &EncodingMap[lir->opcode];
     size_t starting_cbuf_size = cUnit->codeBuffer.size();
     switch (entry->kind) {
@@ -1088,6 +1251,12 @@
       case kShiftRegImm:  // lir operands - 0: reg, 1: immediate
         emitShiftRegImm(cUnit, entry, lir->operands[0], lir->operands[1]);
         break;
+      case kShiftRegCl: // lir operands - 0: reg, 1: cl
+        emitShiftRegCl(cUnit, entry, lir->operands[0], lir->operands[1]);
+        break;
+      case kRegCond:  // lir operands - 0: reg, 1: condition
+        emitRegCond(cUnit, entry, lir->operands[0], lir->operands[1]);
+        break;
       case kJmp:  // lir operands - 0: rel
         emitJmp(cUnit, entry, lir->operands[0]);
         break;
@@ -1107,15 +1276,20 @@
             break;
         }
         break;
+      case kPcRel:  // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
+        emitPcRel(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2],
+                  lir->operands[3], lir->operands[4]);
+        break;
+      case kMacro:
+        emitMacro(cUnit, entry, lir->operands[0], lir->offset);
+        break;
       default:
         emitUnimplemented(cUnit, entry, lir);
         break;
     }
-    if (entry->kind != kJcc && entry->kind != kJmp) {
-      CHECK_EQ(static_cast<size_t>(oatGetInsnSize(lir)),
-               cUnit->codeBuffer.size() - starting_cbuf_size)
-          << "Instruction size mismatch for entry: " << EncodingMap[lir->opcode].name;
-    }
+    CHECK_EQ(static_cast<size_t>(oatGetInsnSize(lir)),
+             cUnit->codeBuffer.size() - starting_cbuf_size)
+        << "Instruction size mismatch for entry: " << EncodingMap[lir->opcode].name;
   }
   return res;
 }
diff --git a/src/compiler/codegen/x86/Codegen.h b/src/compiler/codegen/x86/Codegen.h
index 178b986..52ba7c1 100644
--- a/src/compiler/codegen/x86/Codegen.h
+++ b/src/compiler/codegen/x86/Codegen.h
@@ -31,6 +31,12 @@
                 RegLocation rlSrc1, RegLocation rlSrc2);
 bool genSubLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                 RegLocation rlSrc1, RegLocation rlSrc2);
+bool genAndLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2);
+bool genOrLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2);
+bool genXorLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
+                RegLocation rlSrc1, RegLocation rlSrc2);
 bool genNegLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                 RegLocation rlSrc);
 LIR *opRegImm(CompilationUnit* cUnit, OpKind op, int rDestSrc1, int value);
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index 4987c28..9421744 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -231,6 +231,7 @@
   if (rDest != rSrc1 && rDest != rSrc2) {
     if (op == kOpAdd) { // lea special case, except can't encode rbp as base
       if (rSrc1 == rSrc2) {
+        opRegCopy(cUnit, rDest, rSrc1);
         return opRegImm(cUnit, kOpLsl, rDest, 1);
       } else if (rSrc1 != rBP) {
         return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc1 /* base */,
@@ -285,9 +286,10 @@
     }
   }
   if (rDest != rSrc) {
-    if (op == kOpLsl && value >= 0 && value <= 3) { // lea shift special case
-      return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc /* base */,
-                     r4sib_no_index /* index */, value /* scale */, value /* disp */);
+    if (false && op == kOpLsl && value >= 0 && value <= 3) { // lea shift special case
+      // TODO: fix bug in LEA encoding when disp == 0
+      return newLIR5(cUnit, kX86Lea32RA, rDest,  r5sib_no_base /* base */,
+                     rSrc /* index */, value /* scale */, 0 /* disp */);
     } else if (op == kOpAdd) { // lea add special case
       return newLIR5(cUnit, kX86Lea32RA, rDest, rSrc /* base */,
                      r4sib_no_index /* index */, 0 /* scale */, value /* disp */);
@@ -351,6 +353,7 @@
                                int rIndex, int rDest, int scale, OpSize size)
 {
     UNIMPLEMENTED(WARNING) << "loadBaseIndexed";
+    newLIR0(cUnit, kX86Bkpt);
     return NULL;
 #if 0
     LIR *first = NULL;
@@ -406,6 +409,7 @@
 LIR *loadMultiple(CompilationUnit *cUnit, int rBase, int rMask)
 {
     UNIMPLEMENTED(WARNING) << "loadMultiple";
+    newLIR0(cUnit, kX86Bkpt);
     return NULL;
 #if 0
     int i;
@@ -432,6 +436,7 @@
 LIR *storeMultiple(CompilationUnit *cUnit, int rBase, int rMask)
 {
     UNIMPLEMENTED(WARNING) << "storeMultiple";
+    newLIR0(cUnit, kX86Bkpt);
     return NULL;
 #if 0
     int i;
diff --git a/src/compiler/codegen/x86/X86/Gen.cc b/src/compiler/codegen/x86/X86/Gen.cc
index f957cbc..f2dbc11 100644
--- a/src/compiler/codegen/x86/X86/Gen.cc
+++ b/src/compiler/codegen/x86/X86/Gen.cc
@@ -46,194 +46,93 @@
 }
 
 /*
- * The lack of pc-relative loads on X86 presents somewhat of a challenge
- * for our PIC switch table strategy.  To materialize the current location
- * we'll do a dummy JAL and reference our tables using r_RA as the
- * base register.  Note that r_RA will be used both as the base to
- * locate the switch table data and as the reference base for the switch
- * target offsets stored in the table.  We'll use a special pseudo-instruction
- * to represent the jal and trigger the construction of the
- * switch table offsets (which will happen after final assembly and all
- * labels are fixed).
- *
- * The test loop will look something like:
- *
- *   ori   rEnd, r_ZERO, #tableSize  ; size in bytes
- *   jal   BaseLabel         ; stores "return address" (BaseLabel) in r_RA
- *   nop                     ; opportunistically fill
- * BaseLabel:
- *   addiu rBase, r_RA, <table> - <BaseLabel>  ; table relative to BaseLabel
-     addu  rEnd, rEnd, rBase                   ; end of table
- *   lw    rVal, [rSP, vRegOff]                ; Test Value
- * loop:
- *   beq   rBase, rEnd, done
- *   lw    rKey, 0(rBase)
- *   addu  rBase, 8
- *   bne   rVal, rKey, loop
- *   lw    rDisp, -4(rBase)
- *   addu  r_RA, rDisp
- *   jr    r_RA
- * done:
- *
+ * The sparse table in the literal pool is an array of <key,displacement>
+ * pairs.
  */
-void genSparseSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
-{
-    UNIMPLEMENTED(WARNING) << "genSparseSwitch";
-    return;
-#if 0
-    const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
-    if (cUnit->printMe) {
-        dumpSparseSwitchTable(table);
-    }
-    // Add the table to the list - we'll process it later
-    SwitchTable *tabRec = (SwitchTable *)oatNew(cUnit, sizeof(SwitchTable),
-                         true, kAllocData);
-    tabRec->table = table;
-    tabRec->vaddr = mir->offset;
-    int elements = table[1];
-    tabRec->targets = (LIR* *)oatNew(cUnit, elements * sizeof(LIR*), true,
-                                     kAllocLIR);
-    oatInsertGrowableList(cUnit, &cUnit->switchTables, (intptr_t)tabRec);
-
-    // The table is composed of 8-byte key/disp pairs
-    int byteSize = elements * 8;
-
-    int sizeHi = byteSize >> 16;
-    int sizeLo = byteSize & 0xffff;
-
-    int rEnd = oatAllocTemp(cUnit);
-    if (sizeHi) {
-        newLIR2(cUnit, kX86Lui, rEnd, sizeHi);
-    }
-    // Must prevent code motion for the curr pc pair
-    genBarrier(cUnit);  // Scheduling barrier
-    newLIR0(cUnit, kX86CurrPC);  // Really a jal to .+8
-    // Now, fill the branch delay slot
-    if (sizeHi) {
-        newLIR3(cUnit, kX86Ori, rEnd, rEnd, sizeLo);
-    } else {
-        newLIR3(cUnit, kX86Ori, rEnd, r_ZERO, sizeLo);
-    }
-    genBarrier(cUnit);  // Scheduling barrier
-
-    // Construct BaseLabel and set up table base register
-    LIR* baseLabel = newLIR0(cUnit, kPseudoTargetLabel);
-    // Remember base label so offsets can be computed later
-    tabRec->anchor = baseLabel;
-    int rBase = oatAllocTemp(cUnit);
-    newLIR4(cUnit, kX86Delta, rBase, 0, (intptr_t)baseLabel, (intptr_t)tabRec);
-    opRegRegReg(cUnit, kOpAdd, rEnd, rEnd, rBase);
-
-    // Grab switch test value
-    rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
-
-    // Test loop
-    int rKey = oatAllocTemp(cUnit);
-    LIR* loopLabel = newLIR0(cUnit, kPseudoTargetLabel);
-    LIR* exitBranch = opCmpBranch(cUnit , kCondEq, rBase, rEnd, NULL);
-    loadWordDisp(cUnit, rBase, 0, rKey);
-    opRegImm(cUnit, kOpAdd, rBase, 8);
-    opCmpBranch(cUnit, kCondNe, rlSrc.lowReg, rKey, loopLabel);
-    int rDisp = oatAllocTemp(cUnit);
-    loadWordDisp(cUnit, rBase, -4, rDisp);
-    opRegRegReg(cUnit, kOpAdd, r_RA, r_RA, rDisp);
-    opReg(cUnit, kOpBx, r_RA);
-
-    // Loop exit
-    LIR* exitLabel = newLIR0(cUnit, kPseudoTargetLabel);
-    exitBranch->target = exitLabel;
-#endif
+BasicBlock *findBlock(CompilationUnit* cUnit, unsigned int codeOffset,
+                      bool split, bool create, BasicBlock** immedPredBlockP);
+void genSparseSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc, LIR* labelList) {
+  const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
+  if (cUnit->printMe) {
+    dumpSparseSwitchTable(table);
+  }
+  int entries = table[1];
+  int* keys = (int*)&table[2];
+  int* targets = &keys[entries];
+  rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+  for (int i = 0; i < entries; i++) {
+    int key = keys[i];
+    BasicBlock* case_block = findBlock(cUnit, mir->offset + targets[i],
+                                       false, false, NULL);
+    opCmpImmBranch(cUnit, kCondEq, rlSrc.lowReg, key, &labelList[case_block->id]);
+  }
 }
 
 /*
  * Code pattern will look something like:
  *
- *   lw    rVal
- *   jal   BaseLabel         ; stores "return address" (BaseLabel) in r_RA
- *   nop                     ; opportunistically fill
- *   [subiu rVal, bias]      ; Remove bias if lowVal != 0
- *   bound check -> done
- *   lw    rDisp, [r_RA, rVal]
- *   addu  r_RA, rDisp
- *   jr    r_RA
+ * mov  rVal, ..
+ * call 0
+ * pop  rStartOfMethod
+ * sub  rStartOfMethod, ..
+ * mov  rKeyReg, rVal
+ * sub  rKeyReg, lowKey
+ * cmp  rKeyReg, size-1  ; bound check
+ * ja   done
+ * mov  rDisp, [rStartOfMethod + rKeyReg * 4 + tableOffset]
+ * add  rStartOfMethod, rDisp
+ * jmp  rStartOfMethod
  * done:
  */
-void genPackedSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
-{
-    UNIMPLEMENTED(WARNING) << "genPackedSwitch";
-#if 0
-    const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
-    if (cUnit->printMe) {
-        dumpPackedSwitchTable(table);
-    }
-    // Add the table to the list - we'll process it later
-    SwitchTable *tabRec = (SwitchTable *)oatNew(cUnit, sizeof(SwitchTable),
-                                                true, kAllocData);
-    tabRec->table = table;
-    tabRec->vaddr = mir->offset;
-    int size = table[1];
-    tabRec->targets = (LIR* *)oatNew(cUnit, size * sizeof(LIR*), true,
-                                        kAllocLIR);
-    oatInsertGrowableList(cUnit, &cUnit->switchTables, (intptr_t)tabRec);
+void genPackedSwitch(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc) {
+  const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
+  if (cUnit->printMe) {
+    dumpPackedSwitchTable(table);
+  }
+  // Add the table to the list - we'll process it later
+  SwitchTable *tabRec = (SwitchTable *)oatNew(cUnit, sizeof(SwitchTable),
+                                              true, kAllocData);
+  tabRec->table = table;
+  tabRec->vaddr = mir->offset;
+  int size = table[1];
+  tabRec->targets = (LIR* *)oatNew(cUnit, size * sizeof(LIR*), true,
+                                   kAllocLIR);
+  oatInsertGrowableList(cUnit, &cUnit->switchTables, (intptr_t)tabRec);
 
-    // Get the switch value
-    rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+  // Get the switch value
+  rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+  int startOfMethodReg = oatAllocTemp(cUnit);
+  // Materialize a pointer to the switch table
+  //newLIR0(cUnit, kX86Bkpt);
+  newLIR1(cUnit, kX86StartOfMethod, startOfMethodReg);
+  int lowKey = s4FromSwitchData(&table[2]);
+  int keyReg;
+  // Remove the bias, if necessary
+  if (lowKey == 0) {
+    keyReg = rlSrc.lowReg;
+  } else {
+    keyReg = oatAllocTemp(cUnit);
+    opRegRegImm(cUnit, kOpSub, keyReg, rlSrc.lowReg, lowKey);
+  }
+  // Bounds check - if < 0 or >= size continue following switch
+  opRegImm(cUnit, kOpCmp, keyReg, size-1);
+  LIR* branchOver = opCondBranch(cUnit, kCondHi, NULL);
 
-    // Prepare the bias.  If too big, handle 1st stage here
-    int lowKey = s4FromSwitchData(&table[2]);
-    bool largeBias = false;
-    int rKey;
-    if (lowKey == 0) {
-        rKey = rlSrc.lowReg;
-    } else if ((lowKey & 0xffff) != lowKey) {
-        rKey = oatAllocTemp(cUnit);
-        loadConstant(cUnit, rKey, lowKey);
-        largeBias = true;
-    } else {
-        rKey = oatAllocTemp(cUnit);
-    }
+  // Load the displacement from the switch table
+  int dispReg = oatAllocTemp(cUnit);
+  newLIR5(cUnit, kX86PcRelLoadRA, dispReg, startOfMethodReg, keyReg, 2, (intptr_t)tabRec);
+  // Add displacement to start of method
+  opRegReg(cUnit, kOpAdd, startOfMethodReg, dispReg);
+  // ..and go!
+  LIR* switchBranch = newLIR1(cUnit, kX86JmpR, startOfMethodReg);
+  tabRec->anchor = switchBranch;
 
-    // Must prevent code motion for the curr pc pair
-    genBarrier(cUnit);
-    newLIR0(cUnit, kX86CurrPC);  // Really a jal to .+8
-    // Now, fill the branch delay slot with bias strip
-    if (lowKey == 0) {
-        newLIR0(cUnit, kX86Nop);
-    } else {
-        if (largeBias) {
-            opRegRegReg(cUnit, kOpSub, rKey, rlSrc.lowReg, rKey);
-        } else {
-            opRegRegImm(cUnit, kOpSub, rKey, rlSrc.lowReg, lowKey);
-        }
-    }
-    genBarrier(cUnit);  // Scheduling barrier
-
-    // Construct BaseLabel and set up table base register
-    LIR* baseLabel = newLIR0(cUnit, kPseudoTargetLabel);
-    // Remember base label so offsets can be computed later
-    tabRec->anchor = baseLabel;
-
-    // Bounds check - if < 0 or >= size continue following switch
-    LIR* branchOver = opCmpImmBranch(cUnit, kCondHi, rKey, size-1, NULL);
-
-    // Materialize the table base pointer
-    int rBase = oatAllocTemp(cUnit);
-    newLIR4(cUnit, kX86Delta, rBase, 0, (intptr_t)baseLabel, (intptr_t)tabRec);
-
-    // Load the displacement from the switch table
-    int rDisp = oatAllocTemp(cUnit);
-    loadBaseIndexed(cUnit, rBase, rKey, rDisp, 2, kWord);
-
-    // Add to r_AP and go
-    opRegRegReg(cUnit, kOpAdd, r_RA, r_RA, rDisp);
-    opReg(cUnit, kOpBx, r_RA);
-
-    /* branchOver target here */
-    LIR* target = newLIR0(cUnit, kPseudoTargetLabel);
-    branchOver->target = (LIR*)target;
-#endif
+  /* branchOver target here */
+  LIR* target = newLIR0(cUnit, kPseudoTargetLabel);
+  branchOver->target = (LIR*)target;
 }
 
+void callRuntimeHelperRegReg(CompilationUnit* cUnit, int helperOffset, int arg0, int arg1);
 /*
  * Array data table format:
  *  ushort ident = 0x0300   magic value
@@ -246,47 +145,31 @@
  */
 void genFillArrayData(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
 {
-    UNIMPLEMENTED(WARNING) << "genFillArrayData";
-#if 0
-    const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
-    // Add the table to the list - we'll process it later
-    FillArrayData *tabRec = (FillArrayData *)
-         oatNew(cUnit, sizeof(FillArrayData), true, kAllocData);
-    tabRec->table = table;
-    tabRec->vaddr = mir->offset;
-    u2 width = tabRec->table[1];
-    u4 size = tabRec->table[2] | (((u4)tabRec->table[3]) << 16);
-    tabRec->size = (size * width) + 8;
+  const u2* table = cUnit->insns + mir->offset + mir->dalvikInsn.vB;
+  // Add the table to the list - we'll process it later
+  FillArrayData *tabRec = (FillArrayData *)oatNew(cUnit, sizeof(FillArrayData), true, kAllocData);
+  tabRec->table = table;
+  tabRec->vaddr = mir->offset;
+  u2 width = tabRec->table[1];
+  u4 size = tabRec->table[2] | (((u4)tabRec->table[3]) << 16);
+  tabRec->size = (size * width) + 8;
 
-    oatInsertGrowableList(cUnit, &cUnit->fillArrayData, (intptr_t)tabRec);
+  oatInsertGrowableList(cUnit, &cUnit->fillArrayData, (intptr_t)tabRec);
 
-    // Making a call - use explicit registers
-    oatFlushAllRegs(cUnit);   /* Everything to home location */
-    oatLockCallTemps(cUnit);
-    loadValueDirectFixed(cUnit, rlSrc, rARG0);
-
-    // Must prevent code motion for the curr pc pair
-    genBarrier(cUnit);
-    newLIR0(cUnit, kX86CurrPC);  // Really a jal to .+8
-    // Now, fill the branch delay slot with the helper load
-    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread,
-                          pHandleFillArrayDataFromCode));
-    genBarrier(cUnit);  // Scheduling barrier
-
-    // Construct BaseLabel and set up table base register
-    LIR* baseLabel = newLIR0(cUnit, kPseudoTargetLabel);
-
-    // Materialize a pointer to the fill data image
-    newLIR4(cUnit, kX86Delta, rARG1, 0, (intptr_t)baseLabel, (intptr_t)tabRec);
-
-    // And go...
-    callRuntimeHelper(cUnit, rTgt);  // ( array*, fill_data* )
-#endif
+  // Making a call - use explicit registers
+  oatFlushAllRegs(cUnit);   /* Everything to home location */
+  loadValueDirectFixed(cUnit, rlSrc, rARG0);
+  // Materialize a pointer to the fill data image
+  newLIR1(cUnit, kX86StartOfMethod, rARG2);
+  newLIR2(cUnit, kX86PcRelAdr, rARG1, (intptr_t)tabRec);
+  newLIR2(cUnit, kX86Add32RR, rARG1, rARG2);
+  callRuntimeHelperRegReg(cUnit, ENTRYPOINT_OFFSET(pHandleFillArrayDataFromCode), rARG0, rARG1);
 }
 
 void genNegFloat(CompilationUnit *cUnit, RegLocation rlDest, RegLocation rlSrc)
 {
     UNIMPLEMENTED(WARNING) << "genNegFloat";
+    newLIR0(cUnit, kX86Bkpt);
 #if 0
     RegLocation rlResult;
     rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
@@ -300,6 +183,7 @@
 void genNegDouble(CompilationUnit *cUnit, RegLocation rlDest, RegLocation rlSrc)
 {
     UNIMPLEMENTED(WARNING) << "genNegDouble";
+    newLIR0(cUnit, kX86Bkpt);
 #if 0
     RegLocation rlResult;
     rlSrc = loadValueWide(cUnit, rlSrc, kCoreReg);
@@ -311,21 +195,20 @@
 #endif
 }
 
+LIR* genNullCheck(CompilationUnit* cUnit, int sReg, int mReg, MIR* mir);
+void callRuntimeHelperReg(CompilationUnit* cUnit, int helperOffset, int arg0);
+
 /*
  * TODO: implement fast path to short-circuit thin-lock case
  */
 void genMonitorEnter(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
 {
-    UNIMPLEMENTED(WARNING) << "genMonitorEnter";
-#if 0
     oatFlushAllRegs(cUnit);
     loadValueDirectFixed(cUnit, rlSrc, rARG0);  // Get obj
     oatLockCallTemps(cUnit);  // Prepare for explicit register usage
     genNullCheck(cUnit, rlSrc.sRegLow, rARG0, mir);
     // Go expensive route - artLockObjectFromCode(self, obj);
-    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread, pLockObjectFromCode));
-    callRuntimeHelper(cUnit, rTgt);
-#endif
+    callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pLockObjectFromCode), rARG0);
 }
 
 /*
@@ -333,16 +216,12 @@
  */
 void genMonitorExit(CompilationUnit* cUnit, MIR* mir, RegLocation rlSrc)
 {
-    UNIMPLEMENTED(WARNING) << "genMonitor";
-#if 0
     oatFlushAllRegs(cUnit);
     loadValueDirectFixed(cUnit, rlSrc, rARG0);  // Get obj
     oatLockCallTemps(cUnit);  // Prepare for explicit register usage
     genNullCheck(cUnit, rlSrc.sRegLow, rARG0, mir);
     // Go expensive route - UnlockObjectFromCode(obj);
-    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread, pUnlockObjectFromCode));
-    callRuntimeHelper(cUnit, rTgt);
-#endif
+    callRuntimeHelperReg(cUnit, ENTRYPOINT_OFFSET(pUnlockObjectFromCode), rARG0);
 }
 
 /*
@@ -364,26 +243,20 @@
 void genCmpLong(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
                 RegLocation rlSrc1, RegLocation rlSrc2)
 {
-    UNIMPLEMENTED(WARNING) << "genCmpLong";
-#if 0
-    rlSrc1 = loadValueWide(cUnit, rlSrc1, kCoreReg);
-    rlSrc2 = loadValueWide(cUnit, rlSrc2, kCoreReg);
-    int t0 = oatAllocTemp(cUnit);
-    int t1 = oatAllocTemp(cUnit);
-    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-    newLIR3(cUnit, kX86Slt, t0, rlSrc1.highReg, rlSrc2.highReg);
-    newLIR3(cUnit, kX86Slt, t1, rlSrc2.highReg, rlSrc1.highReg);
-    newLIR3(cUnit, kX86Subu, rlResult.lowReg, t1, t0);
-    LIR* branch = opCmpImmBranch(cUnit, kCondNe, rlResult.lowReg, 0, NULL);
-    newLIR3(cUnit, kX86Sltu, t0, rlSrc1.lowReg, rlSrc2.lowReg);
-    newLIR3(cUnit, kX86Sltu, t1, rlSrc2.lowReg, rlSrc1.lowReg);
-    newLIR3(cUnit, kX86Subu, rlResult.lowReg, t1, t0);
-    oatFreeTemp(cUnit, t0);
-    oatFreeTemp(cUnit, t1);
-    LIR* target = newLIR0(cUnit, kPseudoTargetLabel);
-    branch->target = (LIR*)target;
+    oatFlushAllRegs(cUnit);
+    oatLockCallTemps(cUnit);  // Prepare for explicit register usage
+    loadValueDirectWideFixed(cUnit, rlSrc1, r0, r1);
+    loadValueDirectWideFixed(cUnit, rlSrc2, r2, r3);
+    // Compute (r1:r0) = (r1:r0) - (r2:r3)
+    opRegReg(cUnit, kOpSub, r0, r2);  // r0 = r0 - r2
+    opRegReg(cUnit, kOpSbc, r1, r3);  // r1 = r1 - r3 - CF
+    opRegReg(cUnit, kOpOr, r0, r1);   // r0 = high | low - sets ZF
+    newLIR2(cUnit, kX86Set8R, r0, kX86CondNz);  // r0 = (r1:r0) != (r2:r3) ? 1 : 0
+    newLIR2(cUnit, kX86Movzx8RR, r0, r0);
+    opRegImm(cUnit, kOpAsr, r1, 31);  // r1 = high >> 31
+    opRegReg(cUnit, kOpOr, r0, r1);   // r0 holds result
+    RegLocation rlResult = LOC_C_RETURN;
     storeValue(cUnit, rlDest, rlResult);
-#endif
 }
 
 X86ConditionCode oatX86ConditionEncoding(ConditionCode cond) {
@@ -420,8 +293,12 @@
 LIR* opCmpImmBranch(CompilationUnit* cUnit, ConditionCode cond, int reg,
                     int checkValue, LIR* target)
 {
-  // TODO: when checkValue == 0 and reg is rCX, use the jcxz/nz opcode
-  newLIR2(cUnit, kX86Cmp32RI, reg, checkValue);
+  if (false && (checkValue == 0) && (cond == kCondEq || cond == kCondNe)) {
+    // TODO: when checkValue == 0 and reg is rCX, use the jcxz/nz opcode
+    // newLIR2(cUnit, kX86Test32RR, reg, reg);
+  } else {
+    newLIR2(cUnit, kX86Cmp32RI, reg, checkValue);
+  }
   X86ConditionCode cc = oatX86ConditionEncoding(cond);
   LIR* branch = newLIR2(cUnit, kX86Jcc8, 0 /* lir operand for Jcc offset */ , cc);
   branch->target = target;
@@ -458,10 +335,12 @@
       opRegCopy(cUnit, S2D(destLo, destHi), S2D(srcLo, srcHi));
     } else {
       UNIMPLEMENTED(WARNING);
+      newLIR0(cUnit, kX86Bkpt);
     }
   } else {
     if (srcFP) {
       UNIMPLEMENTED(WARNING);
+      newLIR0(cUnit, kX86Bkpt);
     } else {
       // Handle overlap
       if (srcHi == destLo) {
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index 1fc44b3..85d2565 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -194,6 +194,7 @@
   r4sib_no_index = r4sp,
   r5     = 5,
   rBP    = r5,
+  r5sib_no_base = r5,
   r6     = 6,
   rSI    = r6,
   r7     = 7,
@@ -277,7 +278,7 @@
     kX86CondNge = kX86CondL,  // not-greater-equal
 
     kX86CondNl  = 0xD,        // not-less-than
-    kX86CondGe  = kX86CondL,  // not-greater-equal
+    kX86CondGe  = kX86CondNl, // greater-equal
 
     kX86CondLe  = 0xE,        // less-than-equal
     kX86CondNg  = kX86CondLe, // not-greater
@@ -387,7 +388,6 @@
     BinaryShiftOpCode(kX86Rcl),
     BinaryShiftOpCode(kX86Rcr),
     BinaryShiftOpCode(kX86Sal),
-    BinaryShiftOpCode(kX86Shl),
     BinaryShiftOpCode(kX86Shr),
     BinaryShiftOpCode(kX86Sar),
 #undef BinaryShiftOpcode
@@ -447,12 +447,18 @@
 #undef Binary0fOpCode
     kX86Jcc8, kX86Jcc32,  // jCC rel8/32; lir operands - 0: rel, 1: CC, target assigned
     kX86Jmp8, kX86Jmp32,  // jmp rel8/32; lir operands - 0: rel, target assigned
+    kX86JmpR,   // jmp reg; lir operands - 0: reg
     kX86CallR,  // call reg; lir operands - 0: reg
     kX86CallM,  // call [base + disp]; lir operands - 0: base, 1: disp
     kX86CallA,  // call [base + index * scale + disp]
                 // lir operands - 0: base, 1: index, 2: scale, 3: disp
     kX86CallT,  // call fs:[disp]; fs: is equal to Thread::Current(); lir operands - 0: disp
     kX86Ret,    // ret; no lir operands
+    kX86StartOfMethod,    // call 0; pop reg; sub reg, # - generate start of method into reg
+                          // lir operands - 0: reg
+    kX86PcRelLoadRA, // mov reg, [base + index * scale + PC relative displacement]
+                     // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
+    kX86PcRelAdr, // mov reg, PC relative displacement; lir operands - 0: reg, 1: table
     kX86Last
 };
 
@@ -472,6 +478,8 @@
   kRegRegReg, kRegRegMem, kRegRegArray,    // RRR, RRM, RRA instruction kinds.
   kRegCond, kMemCond, kArrayCond,          // R, M, A instruction kinds following by a condition.
   kJmp, kJcc, kCall,           // Branch instruction kinds.
+  kPcRel,                      // Operation with displacement that is PC relative
+  kMacro,                      // An instruction composing multiple others
   kUnimplemented               // Encoding used when an instruction isn't yet implemented.
 };
 
diff --git a/src/compiler/codegen/x86/X86RallocUtil.cc b/src/compiler/codegen/x86/X86RallocUtil.cc
index ba5c063..2971632 100644
--- a/src/compiler/codegen/x86/X86RallocUtil.cc
+++ b/src/compiler/codegen/x86/X86RallocUtil.cc
@@ -96,9 +96,9 @@
 /* Clobber all regs that might be used by an external C call */
 extern void oatClobberCalleeSave(CompilationUnit *cUnit)
 {
-    oatClobber(cUnit, rBP);
-    oatClobber(cUnit, rSI);
-    oatClobber(cUnit, rDI);
+    oatClobber(cUnit, rAX);
+    oatClobber(cUnit, rCX);
+    oatClobber(cUnit, rDX);
 }
 
 extern RegLocation oatGetReturnWideAlt(CompilationUnit* cUnit) {
diff --git a/src/constants.cc b/src/constants.cc
deleted file mode 100644
index 12632eb..0000000
--- a/src/constants.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// TODO: automatically generate operator<<s for enum types.
-
-#include <iostream>
-
-#include "instruction_set.h"
-#include "invoke_type.h"
-
-namespace art {
-
-std::ostream& operator<<(std::ostream& os, const InstructionSet& rhs) {
-  switch (rhs) {
-    case kNone: os << "none"; break;
-    case kArm: os << "ARM"; break;
-    case kThumb2: os << "Thumb2"; break;
-    case kX86: os << "x86"; break;
-    case kMips: os << "MIPS"; break;
-    default: os << "InstructionSet[" << static_cast<int>(rhs) << "]"; break;
-  }
-  return os;
-}
-
-std::ostream& operator<<(std::ostream& os, const InvokeType& rhs) {
-  switch (rhs) {
-    case kStatic: os << "static"; break;
-    case kDirect: os << "direct"; break;
-    case kVirtual: os << "virtual"; break;
-    case kSuper: os << "super"; break;
-    case kInterface: os << "interface"; break;
-    default: os << "InvokeType[" << static_cast<int>(rhs) << "]"; break;
-  }
-  return os;
-}
-
-}  // namespace art
diff --git a/src/disassembler_x86.cc b/src/disassembler_x86.cc
index 4c8c09a..d7ee80b 100644
--- a/src/disassembler_x86.cc
+++ b/src/disassembler_x86.cc
@@ -57,16 +57,14 @@
   DumpReg0(os, rex, reg_num, byte_operand, size_override);
 }
 
-static void DumpBaseReg(std::ostream& os, uint8_t rex, uint8_t reg,
-                        bool byte_operand, uint8_t size_override) {
+static void DumpBaseReg(std::ostream& os, uint8_t rex, uint8_t reg) {
   size_t reg_num = reg;  // TODO: combine with REX.B on 64bit
-  DumpReg0(os, rex, reg_num, byte_operand, size_override);
+  DumpReg0(os, rex, reg_num, false, 0);
 }
 
-static void DumpIndexReg(std::ostream& os, uint8_t rex, uint8_t reg,
-                         bool byte_operand, uint8_t size_override) {
+static void DumpIndexReg(std::ostream& os, uint8_t rex, uint8_t reg) {
   int reg_num = reg;  // TODO: combine with REX.X on 64bit
-  DumpReg0(os, rex, reg_num, byte_operand, size_override);
+  DumpReg0(os, rex, reg_num, false, 0);
 }
 
 static void DumpSegmentOverride(std::ostream& os, uint8_t segment_prefix) {
@@ -88,7 +86,7 @@
   const char** modrm_opcodes = NULL;
   do {
     switch (*instr) {
-      // Group 1 - lock and repeat prefixes:
+        // Group 1 - lock and repeat prefixes:
       case 0xF0:
       case 0xF2:
       case 0xF3:
@@ -203,6 +201,20 @@
   case 0x0F:  // 2 byte extended opcode
     instr++;
     switch (*instr) {
+      case 0x10: case 0x11:
+        if (prefix[0] == 0xF2) {
+          opcode << "movsd";
+        } else if (prefix[0] == 0xF3) {
+          opcode << "movss";
+        } else if (prefix[2] == 0x66) {
+          opcode << "movupd";
+        } else {
+          opcode << "movups";
+        }
+        has_modrm = true;
+        load = *instr == 0x10;
+        store = !load;
+        break;
       case 0x38:  // 3 byte extended opcode
         opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
         break;
@@ -214,6 +226,16 @@
         opcode << "j" << condition_codes[*instr & 0xF];
         branch_bytes = 4;
         break;
+      case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97:
+      case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: case 0x9E: case 0x9F:
+        opcode << "set" << condition_codes[*instr & 0xF];
+        modrm_opcodes = NULL;
+        reg_is_opcode = true;
+        has_modrm = true;
+        store = true;
+        break;
+      case 0xB6: opcode << "movzxb"; has_modrm = true; load = true; break;
+      case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break;
       default:
         opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
         break;
@@ -228,6 +250,11 @@
     byte_operand = (*instr & 1) == 0;
     immediate_bytes = *instr == 0x81 ? 4 : 1;
     break;
+  case 0x8D:
+    opcode << "lea";
+    has_modrm = true;
+    load = true;
+    break;
   case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7:
     opcode << "mov";
     immediate_bytes = 1;
@@ -238,7 +265,19 @@
     immediate_bytes = 4;
     reg_in_opcode = true;
     break;
+  case 0xC0: case 0xC1:
+    static const char* shift_opcodes[] =
+        {"rol", "ror", "rcl", "rcr", "shl", "shr", "unknown-shift", "sar"};
+    modrm_opcodes = shift_opcodes;
+    has_modrm = true;
+    reg_is_opcode = true;
+    store = true;
+    immediate_bytes = 1;
+    byte_operand = *instr == 0xC0;
+    break;
   case 0xC3: opcode << "ret"; break;
+  case 0xCC: opcode << "int 3"; break;
+  case 0xE8: opcode << "call"; branch_bytes = 4; break;
   case 0xE9: opcode << "jmp"; branch_bytes = 4; break;
   case 0xEB: opcode << "jmp"; branch_bytes = 1; break;
   case 0xFF:
@@ -276,13 +315,13 @@
       uint8_t base = sib & 7;
       address << "[";
       if (base != 5 || mod != 0) {
-        DumpBaseReg(address, rex, base, byte_operand, prefix[2]);
+        DumpBaseReg(address, rex, base);
         if (index != 4) {
           address << " + ";
         }
       }
       if (index != 4) {
-        DumpIndexReg(address, rex, index, byte_operand, prefix[2]);
+        DumpIndexReg(address, rex, index);
         if (ss != 0) {
           address << StringPrintf(" * %d", 1 << ss);
         }
@@ -299,7 +338,7 @@
       if (mod != 3) {
         address << "[";
       }
-      DumpBaseReg(address, rex, rm, byte_operand, prefix[2]);
+      DumpBaseReg(address, rex, rm);
       if (mod == 1) {
         address << StringPrintf(" + %d", *reinterpret_cast<const int8_t*>(instr));
         instr++;
@@ -312,7 +351,7 @@
       }
     }
 
-    if (reg_is_opcode) {
+    if (reg_is_opcode && modrm_opcodes != NULL) {
       opcode << modrm_opcodes[reg_or_opcode];
     }
     if (load) {
diff --git a/src/indirect_reference_table.cc b/src/indirect_reference_table.cc
index ac5e402..14d804b 100644
--- a/src/indirect_reference_table.cc
+++ b/src/indirect_reference_table.cc
@@ -329,27 +329,6 @@
   }
 }
 
-std::ostream& operator<<(std::ostream& os, IndirectRefKind rhs) {
-  switch (rhs) {
-  case kSirtOrInvalid:
-    os << "stack indirect reference table or invalid reference";
-    break;
-  case kLocal:
-    os << "local reference";
-    break;
-  case kGlobal:
-    os << "global reference";
-    break;
-  case kWeakGlobal:
-    os << "weak global reference";
-    break;
-  default:
-    os << "IndirectRefKind[" << static_cast<int>(rhs) << "]";
-    break;
-  }
-  return os;
-}
-
 void IndirectReferenceTable::Dump() const {
   LOG(WARNING) << kind_ << " table dump:";
   std::vector<const Object*> entries(table_, table_ + Capacity());
diff --git a/src/indirect_reference_table.h b/src/indirect_reference_table.h
index b1f6d8c..b0a0d64 100644
--- a/src/indirect_reference_table.h
+++ b/src/indirect_reference_table.h
@@ -106,12 +106,12 @@
  * For convenience these match up with enum jobjectRefType from jni.h.
  */
 enum IndirectRefKind {
-    kSirtOrInvalid = 0,
-    kLocal         = 1,
-    kGlobal        = 2,
-    kWeakGlobal    = 3
+  kSirtOrInvalid = 0, // <<stack indirect reference table or invalid reference>>
+  kLocal         = 1, // <<local reference>>
+  kGlobal        = 2, // <<global reference>>
+  kWeakGlobal    = 3  // <<weak global reference>>
 };
-std::ostream& operator<<(std::ostream& os, IndirectRefKind rhs);
+std::ostream& operator<<(std::ostream& os, const IndirectRefKind& rhs);
 
 /*
  * Determine what kind of indirect reference this is.
diff --git a/src/invoke_type.h b/src/invoke_type.h
index f37f1b4..d724fdb 100644
--- a/src/invoke_type.h
+++ b/src/invoke_type.h
@@ -22,7 +22,11 @@
 namespace art {
 
 enum InvokeType {
-  kStatic, kDirect, kVirtual, kSuper, kInterface,
+  kStatic,    // <<static>>
+  kDirect,    // <<direct>>
+  kVirtual,   // <<virtual>>
+  kSuper,     // <<super>>
+  kInterface, // <<interface>>
   kMaxInvokeType = kInterface
 };
 
diff --git a/src/jdwp/jdwp.h b/src/jdwp/jdwp.h
index d297f6f..7d8bf22 100644
--- a/src/jdwp/jdwp.h
+++ b/src/jdwp/jdwp.h
@@ -87,8 +87,8 @@
  */
 enum JdwpTransportType {
   kJdwpTransportUnknown = 0,
-  kJdwpTransportSocket,       /* transport=dt_socket */
-  kJdwpTransportAndroidAdb,   /* transport=dt_android_adb */
+  kJdwpTransportSocket,       // transport=dt_socket
+  kJdwpTransportAndroidAdb,   // transport=dt_android_adb
 };
 std::ostream& operator<<(std::ostream& os, const JdwpTransportType& rhs);
 
diff --git a/src/mutex.cc b/src/mutex.cc
index 1e30543..b0b82f3 100644
--- a/src/mutex.cc
+++ b/src/mutex.cc
@@ -194,14 +194,4 @@
   }
 }
 
-std::ostream& operator<<(std::ostream& os, const MutexRank& rhs) {
-  switch (rhs) {
-    case kHeapLock: os << "HeapLock"; break;
-    case kThreadListLock: os << "ThreadListLock"; break;
-    case kThreadSuspendCountLock: os << "ThreadSuspendCountLock"; break;
-    default: os << "MutexRank[" << static_cast<int>(rhs) << "]"; break;
-  }
-  return os;
-}
-
 }  // namespace
diff --git a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
index fcff424..69e9c98 100644
--- a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
+++ b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
@@ -27,7 +27,7 @@
 extern "C" void* art_check_and_alloc_array_from_code_with_access_check(uint32_t, void*, int32_t);
 
 // Cast entrypoints.
-extern uint32_t IsAssignableFromCode(const Class* klass, const Class* ref_class);
+extern "C" uint32_t artIsAssignableFromCode(const Class* klass, const Class* ref_class);
 extern "C" void art_can_put_array_element_from_code(void*, void*);
 extern "C" void art_check_cast_from_code(void*, void*);
 
@@ -152,7 +152,7 @@
   points->pCheckAndAllocArrayFromCodeWithAccessCheck = art_check_and_alloc_array_from_code_with_access_check;
 
   // Cast
-  points->pInstanceofNonTrivialFromCode = IsAssignableFromCode;
+  points->pInstanceofNonTrivialFromCode = artIsAssignableFromCode;
   points->pCanPutArrayElementFromCode = art_can_put_array_element_from_code;
   points->pCheckCastFromCode = art_check_cast_from_code;
 
@@ -220,7 +220,8 @@
   points->pF2l = F2L;
   points->pLadd = NULL;
   points->pLand = NULL;
-  points->pLdivmod = __aeabi_ldivmod;
+  points->pLdiv = __aeabi_ldivmod;
+  points->pLdivmod = __aeabi_ldivmod;  // result returned in r2:r3
   points->pLmul = __aeabi_lmul;
   points->pLor = NULL;
   points->pLsub = NULL;
diff --git a/src/oat/runtime/arm/runtime_support_arm.S b/src/oat/runtime/arm/runtime_support_arm.S
index 6163b3e..9c55e66 100644
--- a/src/oat/runtime/arm/runtime_support_arm.S
+++ b/src/oat/runtime/arm/runtime_support_arm.S
@@ -759,9 +759,11 @@
     DELIVER_PENDING_EXCEPTION
 
     .global art_trace_entry_from_code
+    .global art_trace_exit_from_code
     .extern artTraceMethodEntryFromCode
+    .extern artTraceMethodExitFromCode
     /*
-     * Routine that intercepts method calls.
+     * Routine that intercepts method calls and returns.
      */
     ALIGN_FUNCTION_ENTRY
 art_trace_entry_from_code:
@@ -772,14 +774,6 @@
     mov   r12, r0        @ r12 holds reference to code
     pop   {r0-r3}        @ restore arguments
     blx   r12            @ call method
-    /* intentional fallthrough */
-
-    .global art_trace_exit_from_code
-    .extern artTraceMethodExitFromCode
-    /*
-     * Routine that intercepts method returns.
-     */
-    ALIGN_FUNCTION_ENTRY
 art_trace_exit_from_code:
     push  {r0-r1}        @ save return value
     blx   artTraceMethodExitFromCode  @ ()
diff --git a/src/oat/runtime/mips/oat_support_entrypoints_mips.cc b/src/oat/runtime/mips/oat_support_entrypoints_mips.cc
index e20332a..62b20f2 100644
--- a/src/oat/runtime/mips/oat_support_entrypoints_mips.cc
+++ b/src/oat/runtime/mips/oat_support_entrypoints_mips.cc
@@ -218,6 +218,7 @@
   points->pF2l = F2L;
   points->pLadd = NULL;
   points->pLand = NULL;
+  points->pLdiv = NULL;
   points->pLdivmod = NULL;
   points->pLmul = NULL;
   points->pLor = NULL;
diff --git a/src/oat/runtime/oat_support_entrypoints.h b/src/oat/runtime/oat_support_entrypoints.h
index 0e59dd8..1a8e675 100644
--- a/src/oat/runtime/oat_support_entrypoints.h
+++ b/src/oat/runtime/oat_support_entrypoints.h
@@ -107,6 +107,7 @@
   int64_t (*pF2l)(float);
   int64_t (*pLadd)(int64_t, int64_t);
   int64_t (*pLand)(int64_t, int64_t);
+  int64_t (*pLdiv)(int64_t, int64_t);
   int64_t (*pLdivmod)(int64_t, int64_t);
   int64_t (*pLmul)(int64_t, int64_t);
   int64_t (*pLor)(int64_t, int64_t);
diff --git a/src/oat/runtime/support_cast.cc b/src/oat/runtime/support_cast.cc
index 987e764..139239f 100644
--- a/src/oat/runtime/support_cast.cc
+++ b/src/oat/runtime/support_cast.cc
@@ -20,7 +20,7 @@
 namespace art {
 
 // Assignable test for code, won't throw.  Null and equality tests already performed
-uint32_t IsAssignableFromCode(const Class* klass, const Class* ref_class) {
+extern "C" uint32_t artIsAssignableFromCode(const Class* klass, const Class* ref_class) {
   DCHECK(klass != NULL);
   DCHECK(ref_class != NULL);
   return klass->IsAssignableFrom(ref_class) ? 1 : 0;
diff --git a/src/oat/runtime/support_math.cc b/src/oat/runtime/support_math.cc
index adef64a..133b857 100644
--- a/src/oat/runtime/support_math.cc
+++ b/src/oat/runtime/support_math.cc
@@ -94,4 +94,12 @@
   }
 }
 
+extern "C" int64_t artLdivFromCode(int64_t a, int64_t b) {
+  return a / b;
+}
+
+extern "C" int64_t artLdivmodFromCode(int64_t a, int64_t b) {
+  return a % b;
+}
+
 }  // namespace art
diff --git a/src/oat/runtime/support_stubs.cc b/src/oat/runtime/support_stubs.cc
index 5f7d635..fb0b5a4 100644
--- a/src/oat/runtime/support_stubs.cc
+++ b/src/oat/runtime/support_stubs.cc
@@ -27,7 +27,7 @@
 // Lazily resolve a method. Called by stub code.
 const void* UnresolvedDirectMethodTrampolineFromCode(Method* called, Method** sp, Thread* thread,
                                                      Runtime::TrampolineType type) {
-  // TODO: this code is specific to ARM
+#if defined(__arm__)
   // On entry the stack pointed by sp is:
   // | argN       |  |
   // | ...        |  |
@@ -43,10 +43,35 @@
   // | R1         |    arg1
   // | R0         |
   // | Method*    |  <- sp
-  uintptr_t* regs = reinterpret_cast<uintptr_t*>(reinterpret_cast<byte*>(sp) + kPointerSize);
   DCHECK_EQ(48U, Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes());
   Method** caller_sp = reinterpret_cast<Method**>(reinterpret_cast<byte*>(sp) + 48);
+  uintptr_t* regs = reinterpret_cast<uintptr_t*>(reinterpret_cast<byte*>(sp) + kPointerSize);
   uintptr_t caller_pc = regs[10];
+#elif defined(__i386__)
+  // On entry the stack pointed by sp is:
+  // | argN        |  |
+  // | ...         |  |
+  // | arg4        |  |
+  // | arg3 spill  |  |  Caller's frame
+  // | arg2 spill  |  |
+  // | arg1 spill  |  |
+  // | Method*     | ---
+  // | Return      |
+  // | EBP,ESI,EDI |    callee saves
+  // | EBX         |    arg3
+  // | EDX         |    arg2
+  // | ECX         |    arg1
+  // | EAX/Method* |  <- sp
+  DCHECK_EQ(32U, Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes());
+  Method** caller_sp = reinterpret_cast<Method**>(reinterpret_cast<byte*>(sp) + 32);
+  uintptr_t* regs = reinterpret_cast<uintptr_t*>(reinterpret_cast<byte*>(sp));
+  uintptr_t caller_pc = regs[7];
+#else
+  UNIMPLEMENTED(FATAL);
+  Method** caller_sp = NULL;
+  uintptr_t* regs = NULL;
+  uintptr_t caller_pc = 0;
+#endif
   FinishCalleeSaveFrameSetup(thread, sp, Runtime::kRefsAndArgs);
   // Start new JNI local reference state
   JNIEnvExt* env = thread->GetJniEnv();
@@ -88,6 +113,7 @@
     shorty = mh.GetShorty();
     shorty_len = mh.GetShortyLength();
   }
+#if !defined(__i386__)
   // Discover shorty (avoid GCs)
   size_t args_in_regs = 0;
   for (size_t i = 1; i < shorty_len; i++) {
@@ -132,6 +158,7 @@
     }
     cur_arg = cur_arg + (c == 'J' || c == 'D' ? 2 : 1);
   }
+#endif
   // Resolve method filling in dex cache
   if (type == Runtime::kUnknownMethod) {
     called = linker->ResolveMethod(dex_method_idx, caller, !is_virtual);
diff --git a/src/oat/runtime/x86/context_x86.cc b/src/oat/runtime/x86/context_x86.cc
index 35bfd01..2af95bb 100644
--- a/src/oat/runtime/x86/context_x86.cc
+++ b/src/oat/runtime/x86/context_x86.cc
@@ -23,7 +23,7 @@
 
 X86Context::X86Context() {
 #ifndef NDEBUG
-  // Initialize registers with easy to spot debug values
+  // Initialize registers with easy to spot debug values.
   for (int i = 0; i < 8; i++) {
     gprs_[i] = 0xEBAD6070+i;
   }
@@ -37,8 +37,8 @@
   size_t spill_count = __builtin_popcount(core_spills);
   CHECK_EQ(method->GetFpSpillMask(), 0u);
   if (spill_count > 0) {
-    // Lowest number spill is furthest away, walk registers and fill into context
-    int j = 1;
+    // Lowest number spill is furthest away, walk registers and fill into context.
+    int j = 2;  // Offset j to skip return address spill.
     for (int i = 0; i < 8; i++) {
       if (((core_spills >> i) & 1) != 0) {
         gprs_[i] = fr.LoadCalleeSave(spill_count - j);
@@ -50,8 +50,11 @@
 
 void X86Context::DoLongJump() {
 #if defined(__i386__)
-  // Load ESP and EIP
-  gprs_[ESP] -= 4;  // push EIP for return
+  // We push all the registers using memory-memory pushes, we then pop-all to get the registers
+  // set up, we then pop esp which will move us down the stack to the delivery address. At the frame
+  // where the exception will be delivered, we push EIP so that the return will take us to the
+  // correct delivery instruction.
+  gprs_[ESP] -= 4;
   *(reinterpret_cast<uintptr_t*>(gprs_[ESP])) = eip_;
   asm volatile(
       "pushl %4\n\t"
diff --git a/src/oat/runtime/x86/oat_support_entrypoints_x86.cc b/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
index 5d525a9..dd139ee 100644
--- a/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
+++ b/src/oat/runtime/x86/oat_support_entrypoints_x86.cc
@@ -27,7 +27,7 @@
 extern "C" void* art_check_and_alloc_array_from_code_with_access_check(uint32_t, void*, int32_t);
 
 // Cast entrypoints.
-extern uint32_t IsAssignableFromCode(const Class* klass, const Class* ref_class);
+extern "C" uint32_t art_is_assignable_from_code(const Class* klass, const Class* ref_class);
 extern "C" void art_can_put_array_element_from_code(void*, void*);
 extern "C" void art_check_cast_from_code(void*, void*);
 
@@ -72,11 +72,16 @@
 extern int32_t CmplFloat(float a, float b);
 extern int64_t D2L(double d);
 extern int64_t F2L(float f);
+extern "C" int32_t art_idiv_from_code(int32_t, int32_t);
+extern "C" int32_t art_idivmod_from_code(int32_t, int32_t);
+extern "C" int64_t art_ldiv_from_code(int64_t, int64_t);
+extern "C" int64_t art_ldivmod_from_code(int64_t, int64_t);
 
 // Intrinsic entrypoints.
-extern "C" int32_t __memcmp16(void*, void*, int32_t);
+extern "C" int32_t art_memcmp16(void*, void*, int32_t);
 extern "C" int32_t art_indexof(void*, uint32_t, uint32_t, uint32_t);
 extern "C" int32_t art_string_compareto(void*, void*);
+extern "C" void* art_memcpy(void*, const void*, size_t);
 
 // Invoke entrypoints.
 const void* UnresolvedDirectMethodTrampolineFromCode(Method*, Method**, Thread*,
@@ -112,7 +117,7 @@
   points->pCheckAndAllocArrayFromCodeWithAccessCheck = art_check_and_alloc_array_from_code_with_access_check;
 
   // Cast
-  points->pInstanceofNonTrivialFromCode = IsAssignableFromCode;
+  points->pInstanceofNonTrivialFromCode = art_is_assignable_from_code;
   points->pCanPutArrayElementFromCode = art_can_put_array_element_from_code;
   points->pCheckCastFromCode = art_check_cast_from_code;
 
@@ -174,13 +179,14 @@
   points->pL2f = NULL;
   points->pD2iz = NULL;
   points->pF2iz = NULL;
-  points->pIdiv = NULL;
-  points->pIdivmod = NULL;
+  points->pIdiv = art_idiv_from_code;
+  points->pIdivmod = art_idivmod_from_code;
   points->pD2l = D2L;
   points->pF2l = F2L;
   points->pLadd = NULL;
   points->pLand = NULL;
-  points->pLdivmod = NULL;
+  points->pLdiv = art_ldiv_from_code;
+  points->pLdivmod = art_ldivmod_from_code;
   points->pLmul = NULL;
   points->pLor = NULL;
   points->pLsub = NULL;
@@ -191,9 +197,9 @@
 
   // Intrinsics
   points->pIndexOf = art_indexof;
-  points->pMemcmp16 = __memcmp16;
+  points->pMemcmp16 = art_memcmp16;
   points->pStringCompareTo = art_string_compareto;
-  points->pMemcpy = memcpy;
+  points->pMemcpy = art_memcpy;
 
   // Invocation
   points->pUnresolvedDirectMethodTrampolineFromCode = UnresolvedDirectMethodTrampolineFromCode;
diff --git a/src/oat/runtime/x86/runtime_support_x86.S b/src/oat/runtime/x86/runtime_support_x86.S
index 3333469..c6a3aad 100644
--- a/src/oat/runtime/x86/runtime_support_x86.S
+++ b/src/oat/runtime/x86/runtime_support_x86.S
@@ -26,7 +26,8 @@
     // Mac OS' as(1) uses $0, $1, and so on for macro arguments, and function names
     // are mangled with an extra underscore prefix. The use of $x for arguments
     // mean that literals need to be represented with $$x in macros.
-    #define VAR(name,index) _$index
+    #define SYMBOL(name) _ ## name
+    #define VAR(name,index) SYMBOL($index)
     #define LITERAL(value) $$value
 #else
     // Regular gas(1) lets you name macro parameters.
@@ -40,6 +41,7 @@
     // will screw us by inserting a space between the \ and the name. Even in this mode there's
     // no special meaning to $, so literals are still just $x.
     .altmacro
+    #define SYMBOL(name) name
     #define VAR(name,index) name&
     #define LITERAL(value) $value
 #endif
@@ -49,9 +51,15 @@
     .balign 16
 END_MACRO
 
+MACRO1(DEFINE_FUNCTION,c_name)
+    .globl VAR(c_name, 0)
+    ALIGN_FUNCTION_ENTRY
+VAR(c_name, 0):
+END_MACRO
+
     /*
      * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(...)
+     * Runtime::CreateCalleeSaveMethod(kSaveAll)
      */
 MACRO0(SETUP_SAVE_ALL_CALLEE_SAVE_FRAME)
     pushl %edi  // Save callee saves (ebx is saved/restored by the upcall)
@@ -60,16 +68,24 @@
     subl  LITERAL(16), %esp  // Grow stack by 4 words, bottom word will hold Method*
 END_MACRO
 
-MACRO0(RESTORE_CALLEE_SAVE_FRAME)
-    addl LITERAL(16), %esp  // Remove padding
-    popl %ebp  // Restore callee saves
-    popl %esi
-    popl %edi
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kRefsOnly)
+     */
+MACRO0(SETUP_REF_ONLY_CALLEE_SAVE_FRAME)
+    pushl %edi  // Save callee saves (ebx is saved/restored by the upcall)
+    pushl %esi
+    pushl %ebp
+    subl  LITERAL(16), %esp  // Grow stack by 4 words, bottom word will hold Method*
+END_MACRO
+
+MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME)
+    addl LITERAL(28), %esp  // Unwind stack up to return address
 END_MACRO
 
     /*
      * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(...)
+     * Runtime::CreateCalleeSaveMethod(kRefsAndArgs)
      */
 MACRO0(SETUP_REF_AND_ARG_CALLEE_SAVE_FRAME)
     pushl %edi  // Save callee saves
@@ -82,7 +98,10 @@
 END_MACRO
 
 MACRO0(RESTORE_REF_AND_ARG_CALLEE_SAVE_FRAME)
-    addl LITERAL(16), %esp  // Remove padding
+    addl LITERAL(4), %esp  // Remove padding
+    popl %ecx  // Restore args except eax
+    popl %edx
+    popl %ebx
     popl %ebp  // Restore callee saves
     popl %esi
     popl %edi
@@ -99,7 +118,7 @@
     subl  LITERAL(8), %esp                   // Alignment padding
     pushl %ecx                               // pass SP
     pushl %fs:THREAD_SELF_OFFSET             // pass Thread::Current()
-    call artDeliverPendingExceptionFromCode  // artDeliverExceptionFromCode(Thread*, SP)
+    call SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*, SP)
     int3
 END_MACRO
 
@@ -111,8 +130,8 @@
     mov %esp, %ecx
     // Outgoing argument set up
     subl  LITERAL(8), %esp        // alignment padding
-    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     pushl %ecx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     call VAR(cxx_name, 1)         // cxx_name(Thread*, SP)
     int3                          // unreached
 END_MACRO
@@ -124,7 +143,7 @@
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
     mov %esp, %ecx
     // Outgoing argument set up
-    pushl LITERAL(0)              // alignment padding
+    pushl %eax                    // alignment padding
     pushl %ecx                    // pass SP
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     pushl %eax                    // pass arg1
@@ -141,9 +160,9 @@
     // Outgoing argument set up
     pushl %edx                    // pass SP
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
-    pushl %eax                    // pass arg1
     pushl %ecx                    // pass arg2
-    call VAR(cxx_name, 1)         // cxx_name(arg2, arg1, Thread*, SP)
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, Thread*, SP)
     int3                          // unreached
 END_MACRO
 
@@ -204,7 +223,39 @@
     .globl VAR(c_name, 0)
     ALIGN_FUNCTION_ENTRY
 VAR(c_name, 0):
-    int3
+    // Set up the callee save frame to conform with Runtime::CreateCalleeSaveMethod(kRefsAndArgs)
+    // return address
+    pushl %edi
+    pushl %esi
+    pushl %ebp
+    pushl %ebx
+    pushl %edx
+    pushl %ecx
+    pushl %eax  // <-- callee save Method* to go here
+    movl %esp, %edx  // remember SP
+    // Outgoing argument set up
+    subl LITERAL(12), %esp        // alignment padding
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl 32(%edx)                // pass caller Method*
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, arg3, Thread*, SP)
+    movl %edx, %edi               // save code pointer in EDI
+    addl LITERAL(36), %esp        // Pop arguments skip eax
+    popl %ecx                     // Restore args
+    popl %edx
+    popl %ebx
+    popl %ebp  // Restore callee saves.
+    popl %esi
+    // Swap EDI callee save with code pointer.
+    xchgl %edi, (%esp)
+    testl %eax, %eax              // Branch forward if exception pending.
+    jz    1f
+    // Tail call to intended method.
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
 END_MACRO
 
 INVOKE_TRAMPOLINE art_invoke_interface_trampoline, artInvokeInterfaceTrampoline
@@ -215,6 +266,191 @@
 INVOKE_TRAMPOLINE art_invoke_super_trampoline_with_access_check, artInvokeSuperTrampolineWithAccessCheck
 INVOKE_TRAMPOLINE art_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
+MACRO2(TWO_ARG_ALLOC, c_name, cxx_name)
+    .globl VAR(c_name, 0)
+    ALIGN_FUNCTION_ENTRY
+VAR(c_name, 0):
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jz  1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+END_MACRO
+
+MACRO2(THREE_ARG_ALLOC, c_name, cxx_name)
+    .globl VAR(c_name, 0)
+    ALIGN_FUNCTION_ENTRY
+VAR(c_name, 0):
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %ebx                // remember SP
+    // Outgoing argument set up
+    subl LITERAL(12), %esp        // alignment padding
+    pushl %ebx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %edx                    // pass arg3
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call VAR(cxx_name, 1)         // cxx_name(arg1, arg2, arg3, Thread*, SP)
+    addl LITERAL(32), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax               // eax == 0 ?
+    jz  1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+END_MACRO
+
+TWO_ARG_ALLOC art_alloc_object_from_code, artAllocObjectFromCode
+TWO_ARG_ALLOC art_alloc_object_from_code_with_access_check, artAllocObjectFromCodeWithAccessCheck
+THREE_ARG_ALLOC art_alloc_array_from_code, artAllocArrayFromCode
+THREE_ARG_ALLOC art_alloc_array_from_code_with_access_check, artAllocArrayFromCodeWithAccessCheck
+THREE_ARG_ALLOC art_check_and_alloc_array_from_code, artCheckAndAllocArrayFromCode
+THREE_ARG_ALLOC art_check_and_alloc_array_from_code_with_access_check, artCheckAndAllocArrayFromCodeWithAccessCheck
+
+TWO_ARG_ALLOC art_resolve_string_from_code, artResolveStringFromCode
+TWO_ARG_ALLOC art_initialize_static_storage_from_code, artInitializeStaticStorageFromCode
+
+DEFINE_FUNCTION art_lock_object_from_code
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %eax                    // alignment padding
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %eax                    // pass arg1
+    call SYMBOL(artLockObjectFromCode)    // (Object*, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    ret
+
+DEFINE_FUNCTION art_unlock_object_from_code
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %eax                    // alignment padding
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %eax                    // pass arg1
+    call SYMBOL(artUnlockObjectFromCode)  // (Object*, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax              // eax == 0 ?
+    jnz 1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+
+DEFINE_FUNCTION art_handle_fill_data_from_code
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call SYMBOL(artHandleFillArrayDataFromCode)  // (Array* array, const uint16_t* table, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax              // eax == 0 ?
+    jnz 1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+
+DEFINE_FUNCTION art_is_assignable_from_code
+    pushl %eax                    // alignment padding
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call SYMBOL(artIsAssignableFromCode)  // (Class* a, Class* b)
+    addl LITERAL(12), %esp        // pop arguments
+    ret
+
+DEFINE_FUNCTION art_memcpy
+    pushl %edx                    // pass arg3
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call SYMBOL(memcpy)           // (void*, const void*, size_t)
+    addl LITERAL(12), %esp        // pop arguments
+    ret
+
+DEFINE_FUNCTION art_check_cast_from_code
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call SYMBOL(artCheckCastFromCode)  // (Class* a, Class* b, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax              // eax == 0 ?
+    jnz 1f
+    ret
+1:
+    DELIVER_PENDING_EXCEPTION
+
+DEFINE_FUNCTION art_idiv_from_code
+    cdq         // edx:eax = sign extend eax
+    idiv %ecx   // (edx,eax) = (edx:eax % ecx, edx:eax / ecx)
+    ret
+
+DEFINE_FUNCTION art_idivmod_from_code
+    cdq         // edx:eax = sign extend eax
+    idiv %ecx   // (edx,eax) = (edx:eax % ecx, edx:eax / ecx)
+    movl %eax, %edx
+    ret
+
+DEFINE_FUNCTION art_ldiv_from_code
+    subl LITERAL(12), %esp        // alignment padding
+    pushl %ebx                    // pass arg4
+    pushl %edx                    // pass arg3
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call SYMBOL(artLdivFromCode)  // (jlong a, jlong b)
+    addl LITERAL(28), %esp        // pop arguments
+    ret
+
+DEFINE_FUNCTION art_ldivmod_from_code
+    subl LITERAL(12), %esp        // alignment padding
+    pushl %ebx                    // pass arg4
+    pushl %edx                    // pass arg3
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call SYMBOL(artLdivmodFromCode) // (jlong a, jlong b)
+    addl LITERAL(28), %esp        // pop arguments
+    ret
+
+DEFINE_FUNCTION art_can_put_array_element_from_code
+    test %eax, %eax               // Null is trivially storable
+    jz   1f
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
+    mov %esp, %edx                // remember SP
+    // Outgoing argument set up
+    pushl %edx                    // pass SP
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    pushl %ecx                    // pass arg2
+    pushl %eax                    // pass arg1
+    call SYMBOL(artCanPutArrayElementFromCode)  // (Object* element, Class* array_class, Thread*, SP)
+    addl LITERAL(16), %esp        // pop arguments
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    testl %eax, %eax              // eax == 0 ?
+    jnz 2f
+1:
+    ret
+2:
+    DELIVER_PENDING_EXCEPTION
+
 MACRO1(UNIMPLEMENTED,name)
     .globl VAR(name, 0)
     ALIGN_FUNCTION_ENTRY
@@ -226,18 +462,8 @@
 UNIMPLEMENTED art_proxy_invoke_handler
 UNIMPLEMENTED art_update_debugger
 UNIMPLEMENTED art_test_suspend
-UNIMPLEMENTED art_alloc_object_from_code
-UNIMPLEMENTED art_alloc_object_from_code_with_access_check
-UNIMPLEMENTED art_alloc_array_from_code
-UNIMPLEMENTED art_alloc_array_from_code_with_access_check
-UNIMPLEMENTED art_check_and_alloc_array_from_code
-UNIMPLEMENTED art_check_and_alloc_array_from_code_with_access_check
-UNIMPLEMENTED art_can_put_array_element_from_code
-UNIMPLEMENTED art_check_cast_from_code
-UNIMPLEMENTED art_initialize_static_storage_from_code
 UNIMPLEMENTED art_initialize_type_and_verify_access_from_code
 UNIMPLEMENTED art_initialize_type_from_code
-UNIMPLEMENTED art_resolve_string_from_code
 UNIMPLEMENTED art_set32_instance_from_code
 UNIMPLEMENTED art_set64_instance_from_code
 UNIMPLEMENTED art_set_obj_instance_from_code
@@ -250,9 +476,6 @@
 UNIMPLEMENTED art_get32_static_from_code
 UNIMPLEMENTED art_get64_static_from_code
 UNIMPLEMENTED art_get_obj_static_from_code
-UNIMPLEMENTED art_handle_fill_data_from_code
-UNIMPLEMENTED art_lock_object_from_code
-UNIMPLEMENTED art_unlock_object_from_code
 UNIMPLEMENTED art_indexof
-UNIMPLEMENTED __memcmp16
+UNIMPLEMENTED art_memcmp16
 UNIMPLEMENTED art_string_compareto
diff --git a/src/oat/runtime/x86/stub_x86.cc b/src/oat/runtime/x86/stub_x86.cc
index 14e4f23..1dea0a1 100644
--- a/src/oat/runtime/x86/stub_x86.cc
+++ b/src/oat/runtime/x86/stub_x86.cc
@@ -25,11 +25,40 @@
 namespace art {
 namespace x86 {
 
-ByteArray* X86CreateResolutionTrampoline(Runtime::TrampolineType) {
+ByteArray* X86CreateResolutionTrampoline(Runtime::TrampolineType type) {
   UniquePtr<X86Assembler> assembler(static_cast<X86Assembler*>(Assembler::Create(kX86)));
 
-  // TODO: unimplemented
-  __ int3();
+  // Set up the callee save frame to conform with Runtime::CreateCalleeSaveMethod(kRefsAndArgs)
+  // return address
+  __ pushl(EDI);
+  __ pushl(ESI);
+  __ pushl(EBP);
+  __ pushl(EBX);
+  __ pushl(EDX);
+  __ pushl(ECX);
+  __ pushl(EAX);  // <-- callee save Method* to go here
+  __ movl(ECX, ESP);          // save ESP
+  __ pushl(Immediate(type));  // pass is_static
+  __ fs()->pushl(Address::Absolute(Thread::SelfOffset()));  // Thread*
+  __ pushl(ECX);              // pass ESP for Method*
+  __ pushl(EAX);              // pass Method*
+
+  // Call to resolve method.
+  __ Call(ThreadOffset(ENTRYPOINT_OFFSET(pUnresolvedDirectMethodTrampolineFromCode)),
+          X86ManagedRegister::FromCpuRegister(ECX));
+
+  __ movl(EDI, EAX);  // save code pointer in EDI
+  __ addl(ESP, Immediate(16));  // Pop arguments
+  __ popl(EAX);  // Restore args.
+  __ popl(ECX);
+  __ popl(EDX);
+  __ popl(EBX);
+  __ popl(EBP);  // Restore callee saves.
+  __ popl(ESI);
+  // Swap EDI callee save with code pointer
+  __ xchgl(EDI, Address(ESP,0));
+  // Tail call to intended method.
+  __ ret();
 
   assembler->EmitSlowPaths();
   size_t cs = assembler->CodeSize();
@@ -46,6 +75,8 @@
 ByteArray* CreateAbstractMethodErrorStub() {
   UniquePtr<X86Assembler> assembler(static_cast<X86Assembler*>(Assembler::Create(kX86)));
 
+  // Set up the callee save frame to conform with Runtime::CreateCalleeSaveMethod(kSaveAll)
+
   // return address
   __ pushl(EDI);
   __ pushl(ESI);
@@ -66,6 +97,7 @@
 
 #if defined(ART_USE_LLVM_COMPILER)
   // Return to caller who will handle pending exception.
+  // TODO: The callee save set up is unnecessary for LLVM as it uses shadow stacks.
   __ addl(ESP, Immediate(32));
   __ popl(EBP);
   __ popl(ESI);
diff --git a/src/oat/utils/x86/assembler_x86.cc b/src/oat/utils/x86/assembler_x86.cc
index 0862551..569ec09 100644
--- a/src/oat/utils/x86/assembler_x86.cc
+++ b/src/oat/utils/x86/assembler_x86.cc
@@ -758,6 +758,12 @@
   EmitRegisterOperand(dst, src);
 }
 
+void X86Assembler::xchgl(Register reg, const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x87);
+  EmitOperand(reg, address);
+}
+
 
 void X86Assembler::cmpl(Register reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1172,6 +1178,11 @@
   EmitRegisterOperand(4, reg);
 }
 
+void X86Assembler::jmp(const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xFF);
+  EmitOperand(4, address);
+}
 
 void X86Assembler::jmp(Label* label) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/src/oat/utils/x86/assembler_x86.h b/src/oat/utils/x86/assembler_x86.h
index 886e173..58acab2 100644
--- a/src/oat/utils/x86/assembler_x86.h
+++ b/src/oat/utils/x86/assembler_x86.h
@@ -344,6 +344,7 @@
   void fptan();
 
   void xchgl(Register dst, Register src);
+  void xchgl(Register reg, const Address& address);
 
   void cmpl(Register reg, const Immediate& imm);
   void cmpl(Register reg0, Register reg1);
@@ -426,6 +427,7 @@
   void j(Condition condition, Label* label);
 
   void jmp(Register reg);
+  void jmp(const Address& address);
   void jmp(Label* label);
 
   X86Assembler* lock();
diff --git a/src/runtime.cc b/src/runtime.cc
index f434fed..c4a9bd7 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -964,8 +964,14 @@
     method->SetCoreSpillMask(core_spills);
     method->SetFpSpillMask(fp_spills);
   } else if (instruction_set == kX86) {
-    method->SetFrameSizeInBytes(32);
-    method->SetCoreSpillMask((1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI));
+    uint32_t ref_spills = (1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI);
+    uint32_t arg_spills = (1 << art::x86::ECX) | (1 << art::x86::EDX) | (1 << art::x86::EBX);
+    uint32_t core_spills = ref_spills | (type == kRefsAndArgs ? arg_spills : 0) |
+                         (1 << art::x86::kNumberOfCpuRegisters);  // fake return address callee save
+    size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
+                                 1 /* Method* */) * kPointerSize, kStackAlignment);
+    method->SetFrameSizeInBytes(frame_size);
+    method->SetCoreSpillMask(core_spills);
     method->SetFpSpillMask(0);
   } else {
     UNIMPLEMENTED(FATAL);
diff --git a/src/thread.cc b/src/thread.cc
index 846aa39..125480e 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -1189,7 +1189,9 @@
       if (LIKELY(!method_tracing_active)) {
         pc = ManglePc(return_pc);
       } else {
-        if (IsTraceExitPc(return_pc)) {
+        // While profiling, the return pc is restored from the side stack, except when walking
+        // the stack for an exception where the side stack will be unwound in VisitFrame.
+        if (IsTraceExitPc(return_pc) && !include_upcalls) {
           TraceStackFrame trace_frame = GetTraceStackFrame(trace_stack_depth++);
           CHECK(trace_frame.method_ == frame.GetMethod());
           pc = ManglePc(trace_frame.return_pc_);
@@ -1484,7 +1486,7 @@
     } else if (method->IsNative()) {
       native_method_count_++;
     } else {
-      // Unwind stack during method tracing
+      // Unwind stack when an exception occurs during method tracing
       if (UNLIKELY(method_tracing_active_)) {
         if (IsTraceExitPc(DemanglePc(pc))) {
           pc = ManglePc(TraceMethodUnwindFromCode(Thread::Current()));
diff --git a/src/trace.cc b/src/trace.cc
index c38f017..3b4c3e5 100644
--- a/src/trace.cc
+++ b/src/trace.cc
@@ -200,11 +200,6 @@
     return;
   }
 
-  // TODO: implement alloc counting.
-  if (flags != 0) {
-    UNIMPLEMENTED(FATAL) << "trace flags";
-  }
-
   ScopedThreadStateChange tsc(Thread::Current(), Thread::kRunnable);
   Runtime::Current()->GetThreadList()->SuspendAll(false);
 
@@ -226,7 +221,13 @@
   }
 
   // Create Trace object.
-  Trace* tracer(new Trace(trace_file, buffer_size));
+  Trace* tracer(new Trace(trace_file, buffer_size, flags));
+
+  // Enable count of allocs if specified in the flags.
+  if ((flags & kTraceCountAllocs) != 0) {
+    Runtime::Current()->SetStatsEnabled(true);
+  }
+
   Runtime::Current()->EnableMethodTracing(tracer);
   tracer->BeginTracing();
 
@@ -297,6 +298,10 @@
   size_t final_offset = cur_offset_;
   uint32_t clock_overhead = GetClockOverhead();
 
+  if ((flags_ & kTraceCountAllocs) != 0) {
+    Runtime::Current()->SetStatsEnabled(false);
+  }
+
   GetVisitedMethods(final_offset);
 
   std::ostringstream os;
@@ -317,6 +322,11 @@
   os << StringPrintf("num-method-calls=%zd\n", (final_offset - kTraceHeaderLength) / record_size_);
   os << StringPrintf("clock-call-overhead-nsec=%d\n", clock_overhead);
   os << StringPrintf("vm=art\n");
+  if ((flags_ & kTraceCountAllocs) != 0) {
+    os << StringPrintf("alloc-count=%d\n", Runtime::Current()->GetStat(KIND_ALLOCATED_OBJECTS));
+    os << StringPrintf("alloc-size=%d\n", Runtime::Current()->GetStat(KIND_ALLOCATED_BYTES));
+    os << StringPrintf("gc-count=%d\n", Runtime::Current()->GetStat(KIND_GC_INVOCATIONS));
+  }
   os << StringPrintf("%cthreads\n", kTraceTokenChar);
   DumpThreadList(os);
   os << StringPrintf("%cmethods\n", kTraceTokenChar);
diff --git a/src/trace.h b/src/trace.h
index b0366d9..8dbf924 100644
--- a/src/trace.h
+++ b/src/trace.h
@@ -52,6 +52,10 @@
     kMethodTraceUnwind = 2,
   };
 
+  enum TraceFlag {
+    kTraceCountAllocs = 1,
+  };
+
   static void Start(const char* trace_filename, int trace_fd, int buffer_size, int flags, bool direct_to_ddms);
   static void Stop();
   static void Shutdown();
@@ -66,9 +70,9 @@
   void ResetSavedCode(Method* method);
 
  private:
-  explicit Trace(File* trace_file, int buffer_size)
-      : trace_file_(trace_file), buf_(new uint8_t[buffer_size]()), overflow_(false), buffer_size_(buffer_size),
-        start_time_(0), trace_version_(0), record_size_(0), cur_offset_(0) {
+  explicit Trace(File* trace_file, int buffer_size, int flags)
+      : trace_file_(trace_file), buf_(new uint8_t[buffer_size]()), flags_(flags), overflow_(false),
+        buffer_size_(buffer_size), start_time_(0), trace_version_(0), record_size_(0), cur_offset_(0) {
   }
 
   void BeginTracing();
@@ -100,6 +104,9 @@
   // Buffer to store trace data.
   UniquePtr<uint8_t> buf_;
 
+  // Flags enabling extra tracing of things such as alloc counts.
+  int flags_;
+
   bool overflow_;
   int buffer_size_;
   uint64_t start_time_;
diff --git a/tools/generate-operator-out.py b/tools/generate-operator-out.py
new file mode 100755
index 0000000..ca7df1e
--- /dev/null
+++ b/tools/generate-operator-out.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python2.4
+#
+# Copyright 2012 Google Inc. All Rights Reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#    * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#    * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Generates default implementations of operator<< for enum types."""
+
+import codecs
+import os
+import re
+import string
+import sys
+
+
+_ENUM_START_RE = re.compile(r'\benum\b\s+(\S+)\s+\{')
+_ENUM_VALUE_RE = re.compile(r'([A-Za-z0-9_]+)(.*)')
+_ENUM_END_RE = re.compile(r'^\s*\};$')
+_ENUMS = {}
+
+def Confused(filename, line_number, line):
+  sys.stderr.write('%s:%d: confused by:\n%s\n' % (filename, line_number, line))
+  sys.exit(1)
+
+
+def ProcessFile(filename):
+  lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
+  in_enum = False
+  line_number = 0
+  for raw_line in lines:
+    line_number += 1
+
+    # TODO: take namespaces and enclosing classes/structs into account.
+
+    # Is this the start of a new enum?
+    if not in_enum:
+      m = _ENUM_START_RE.search(raw_line)
+      if m:
+        # Yes, so add an empty entry to _ENUMS for this enum.
+        enum_name = m.group(1)
+        _ENUMS[enum_name] = []
+        in_enum = True
+      continue
+
+    # Is this the end of the current enum?
+    m = _ENUM_END_RE.search(raw_line)
+    if m:
+      if not in_enum:
+        Confused(filename, line_number, raw_line)
+      in_enum = False
+      continue
+
+    # Is this another enum value?
+    m = _ENUM_VALUE_RE.search(raw_line.strip())
+    if not m:
+      Confused(filename, line_number, raw_line)
+
+    enum_value = m.group(1)
+
+    # By default, we turn "kSomeValue" into "SomeValue".
+    enum_text = enum_value
+    if enum_text.startswith('k'):
+      enum_text = enum_text[1:]
+
+    # Lose literal values because we don't care; turn "= 123, // blah" into ", // blah".
+    rest = m.group(2).strip()
+    m_literal = re.compile(r'= (0x[0-9a-f]+|-?[0-9]+)').search(rest)
+    if m_literal:
+      rest = rest[(len(m_literal.group(0))):]
+
+    # With "kSomeValue = kOtherValue," we take the original and skip later synonyms.
+    # TODO: check that the rhs is actually an existing value.
+    if rest.startswith('= k'):
+      continue
+
+    # Remove any trailing comma and whitespace
+    if rest.startswith(','):
+      rest = rest[1:]
+    rest = rest.strip()
+
+    # Anything left should be a comment.
+    if len(rest) and not rest.startswith('// '):
+      sys.stderr.write('%s\n' % rest)
+      Confused(filename, line_number, raw_line)
+
+    m_comment = re.compile(r'<<(.*?)>>').search(rest)
+    if m_comment:
+      enum_text = m_comment.group(1)
+
+    _ENUMS[enum_name].append((enum_value, enum_text))
+
+def main():
+  header_files = []
+  for header_file in sys.argv[1:]:
+    header_files.append(header_file)
+    ProcessFile(header_file)
+
+  print '#include <iostream>'
+  print
+
+  for header_file in header_files:
+    # Make gives us paths relative to the top of the tree, but our -I is art/.
+    # We also have -I art/src/, but icu4c is higher on the include path and has a "mutex.h" too.
+    header_file = header_file.replace('art/', '')
+    print '#include "%s"' % header_file
+
+  print
+  print 'namespace art {'
+  print
+
+  for enum_name in _ENUMS:
+    print '// This was automatically generated by %s --- do not edit!' % sys.argv[0]
+    print 'std::ostream& operator<<(std::ostream& os, const %s& rhs) {' % enum_name
+    print '  switch (rhs) {'
+    for (enum_value, enum_text) in _ENUMS[enum_name]:
+      print '    case %s: os << "%s"; break;' % (enum_value, enum_text)
+    print '    default: os << "%s[" << static_cast<int>(rhs) << "]"; break;' % enum_name
+    print '  }'
+    print '  return os;'
+    print '}'
+    print
+
+  print '} // namespace art'
+
+  sys.exit(0)
+
+
+if __name__ == '__main__':
+  main()