Compiler intrinsics

Add intrinsic support.  Some of these appear to be of
limited value, so we may end up removing a few.  In general,
the intrinsics provide small, but measurable, gains.

Only Arm is currently supported, but most of these should
work for our other targets as well.

This is an interim solution.  My plan is to move the intrinsic
recognition action up into the basic block building phase once
we start doing inlining.

Change-Id: Ia2913f2cdecaa4e80469caf69dbf8e2f61d4506a
diff --git a/src/asm_support.h b/src/asm_support.h
index 6ba23bc..d5bc370 100644
--- a/src/asm_support.h
+++ b/src/asm_support.h
@@ -19,6 +19,11 @@
 
 #define SUSPEND_CHECK_INTERVAL (1000)
 
+#define STRING_VALUE_OFFSET 8
+#define STRING_COUNT_OFFSET 12
+#define STRING_OFFSET_OFFSET 20
+#define STRING_DATA_OFFSET 12
+
 #if defined(__arm__)
 #define rSUSPEND r4
 #define rSELF r9
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index 429772a..a440555 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -375,6 +375,7 @@
     GrowableList domPostOrderTraversal;
     GrowableList throwLaunchpads;
     GrowableList suspendLaunchpads;
+    GrowableList intrinsicLaunchpads;
     GrowableList compilerTemps;
     int* iDomList;
     ArenaBitVector* tryBlockAddr;
@@ -601,6 +602,8 @@
 
 void oatInsertLIRAfter(LIR* currentLIR, LIR* newLIR);
 
+MIR* oatFindMoveResult(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+                       bool wide);
 /* Debug Utilities */
 void oatDumpCompilationUnit(CompilationUnit* cUnit);
 
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index a87c401..19282d1 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -1655,6 +1655,35 @@
     return mir;
 }
 
+/*
+ * To be used at an invoke mir.  If the logically next mir node is a
+ * MOVE_RESULT (or MOVE_RESULT_WIDE when "wide" is set), return it.  Else,
+ * return NULL.  If a move-result exists, it is required to immediately
+ * follow the invoke with no intervening opcodes or incoming arcs.  However,
+ * if the result of the invoke is not used, a move-result may not be present.
+ */
+MIR* oatFindMoveResult(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+                       bool wide)
+{
+    BasicBlock* tbb = bb;  // scratch copy handed to advanceMIR by address
+    mir = advanceMIR(cUnit, &tbb, mir, NULL, false);
+    while (mir != NULL) {
+        if (!wide && mir->dalvikInsn.opcode == Instruction::MOVE_RESULT) {
+            break;
+        }
+        if (wide && mir->dalvikInsn.opcode == Instruction::MOVE_RESULT_WIDE) {
+            break;
+        }
+        // Skip pseudo ops (opcode >= kNumPackedOpcodes); any real opcode ends the search
+        if (mir->dalvikInsn.opcode < static_cast<Instruction::Code>(kNumPackedOpcodes)) {
+            mir = NULL;
+        } else {
+            mir = advanceMIR(cUnit, &tbb, mir, NULL, false);
+        }
+    }
+    return mir;
+}
+
 void squashDupRangeChecks(CompilationUnit* cUnit, BasicBlock** pBp, MIR* mir,
                           int arraySreg, int indexSreg)
 {
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index b2676a6..651087f 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -831,6 +831,11 @@
     oatInitGrowableList(cUnit.get(), &cUnit->throwLaunchpads, cUnit->insnsSize,
                         kListThrowLaunchPads);
 
+    /* Initialize the intrinsicLaunchpads list */
+    oatInitGrowableList(cUnit.get(), &cUnit->intrinsicLaunchpads, 4,
+                        kListMisc);
+
+
     /* Intialize the suspendLaunchpads list */
     oatInitGrowableList(cUnit.get(), &cUnit->suspendLaunchpads, 2048,
                         kListSuspendLaunchPads);
@@ -984,7 +989,7 @@
     if (!(cUnit->disableOpt & (1 << kSkipLargeMethodOptimization))) {
         if ((cUnit->numBlocks > MANY_BLOCKS) ||
               ((cUnit->numBlocks > MANY_BLOCKS_INITIALIZER) &&
-               PrettyMethod(method_idx, dex_file).find("init>") !=
+               PrettyMethod(method_idx, dex_file, false).find("init>") !=
                std::string::npos)) {
             cUnit->qdMode = true;
         }
diff --git a/src/compiler/codegen/CodegenUtil.cc b/src/compiler/codegen/CodegenUtil.cc
index f2449e5..2318a04 100644
--- a/src/compiler/codegen/CodegenUtil.cc
+++ b/src/compiler/codegen/CodegenUtil.cc
@@ -296,6 +296,9 @@
         case kPseudoThrowTarget:
             LOG(INFO) << "LT" << (void*)lir << ":";
             break;
+        case kPseudoIntrinsicRetry:
+            LOG(INFO) << "IR" << (void*)lir << ":";
+            break;
         case kPseudoSuspendTarget:
             LOG(INFO) << "LS" << (void*)lir << ":";
             break;
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index 3ca0450..0ef1641 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -21,7 +21,8 @@
  * be applicable to most targets.  Only mid-level support utilities
  * and "op" calls may be used here.
  */
-
+void genInvoke(CompilationUnit* cUnit, BasicBlock* bb,  MIR* mir,
+               InvokeType type, bool isRange);
 #if defined(TARGET_ARM)
 LIR* opIT(CompilationUnit* cUnit, ArmConditionCode cond, const char* guide);
 #endif
@@ -835,6 +836,7 @@
     int numElems = cUnit->suspendLaunchpads.numUsed;
     for (int i = 0; i < numElems; i++) {
         oatResetRegPool(cUnit);
+        oatResetDefTracking(cUnit);
         LIR* lab = suspendLabel[i];
         LIR* resumeLab = (LIR*)lab->operands[0];
         cUnit->currentDalvikOffset = lab->operands[1];
@@ -851,12 +853,34 @@
     }
 }
 
+void handleIntrinsicLaunchpads(CompilationUnit *cUnit)
+{   // Emit the out-of-line retry code for every recorded intrinsic launchpad.
+    LIR** intrinsicLabel = (LIR **)cUnit->intrinsicLaunchpads.elemList;
+    int numElems = cUnit->intrinsicLaunchpads.numUsed;
+    for (int i = 0; i < numElems; i++) {
+        oatResetRegPool(cUnit);
+        oatResetDefTracking(cUnit);
+        LIR* lab = intrinsicLabel[i];
+        MIR* mir = (MIR*)lab->operands[0];               // operands[0]: the invoke MIR
+        InvokeType type = (InvokeType)lab->operands[1];  // operands[1]: its invoke type
+        BasicBlock* bb = (BasicBlock*)lab->operands[3];  // operands[3]: block passed by the generator
+        cUnit->currentDalvikOffset = mir->offset;
+        oatAppendLIR(cUnit, lab);  // the launchpad label itself is the branch target
+        genInvoke(cUnit, bb, mir, type, false /* isRange */);  // MIR_INLINED is set => normal invoke
+        LIR* resumeLab = (LIR*)lab->operands[2];         // operands[2]: resume label, NULL if none
+        if (resumeLab != NULL) {
+            opUnconditionalBranch(cUnit, resumeLab);
+        }
+    }
+}
+
 void handleThrowLaunchpads(CompilationUnit *cUnit)
 {
     LIR** throwLabel = (LIR **)cUnit->throwLaunchpads.elemList;
     int numElems = cUnit->throwLaunchpads.numUsed;
     for (int i = 0; i < numElems; i++) {
         oatResetRegPool(cUnit);
+        oatResetDefTracking(cUnit);
         LIR* lab = throwLabel[i];
         cUnit->currentDalvikOffset = lab->operands[1];
         oatAppendLIR(cUnit, lab);
diff --git a/src/compiler/codegen/GenInvoke.cc b/src/compiler/codegen/GenInvoke.cc
index ebc8bc2..ba027f0 100644
--- a/src/compiler/codegen/GenInvoke.cc
+++ b/src/compiler/codegen/GenInvoke.cc
@@ -25,6 +25,8 @@
 typedef int (*NextCallInsn)(CompilationUnit*, MIR*, int, uint32_t dexIdx,
                             uint32_t methodIdx, uintptr_t directCode,
                             uintptr_t directMethod, InvokeType type);
+LIR* opCondBranch(CompilationUnit* cUnit, ConditionCode cc, LIR* target);
+
 /*
  * If there are any ins passed in registers that have not been promoted
  * to a callee-save register, flush them to the frame.  Perform intial
@@ -596,4 +598,339 @@
     return callState;
 }
 
+RegLocation inlineTarget(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir)
+{   // Pick the location an inlined (non-wide) intrinsic should write its result to.
+    RegLocation res;
+    mir = oatFindMoveResult(cUnit, bb, mir, false);  // mir now names the move-result, if any
+    if (mir == NULL) {
+        res = oatGetReturn(cUnit, false);  // no move-result found: use the return location
+    } else {
+        res = oatGetDest(cUnit, mir, 0);   // write straight into the move-result's destination
+        mir->dalvikInsn.opcode = Instruction::NOP;  // and retire the move-result itself
+    }
+    return res;
+}
+
+RegLocation inlineTargetWide(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir)
+{   // Wide (register pair) counterpart of inlineTarget: looks for MOVE_RESULT_WIDE.
+    RegLocation res;
+    mir = oatFindMoveResult(cUnit, bb, mir, true);  // mir now names the move-result-wide, if any
+    if (mir == NULL) {
+        res = oatGetReturnWide(cUnit, false);    // no move-result found: use the wide return location
+    } else {
+        res = oatGetDestWide(cUnit, mir, 0, 1);  // write straight into the move-result's destination
+        mir->dalvikInsn.opcode = Instruction::NOP;  // and retire the move-result itself
+    }
+    return res;
+}
+
+bool genInlinedCharAt(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+                      InvokeType type, bool isRange)
+{
+#if defined(TARGET_ARM)
+    // Offset of the reference to the data array (array_) within String
+    int valueOffset = String::ValueOffset().Int32Value();
+    // Offset of the count field within String
+    int countOffset = String::CountOffset().Int32Value();
+    // Offset of the field holding the starting offset within the data array
+    int offsetOffset = String::OffsetOffset().Int32Value();
+    // Start of char data within array_
+    int dataOffset = Array::DataOffset(sizeof(uint16_t)).Int32Value();
+
+    RegLocation rlObj = oatGetSrc(cUnit, mir, 0);
+    RegLocation rlIdx = oatGetSrc(cUnit, mir, 1);
+    rlObj = loadValue(cUnit, rlObj, kCoreReg);
+    rlIdx = loadValue(cUnit, rlIdx, kCoreReg);
+    int regMax;  // only allocated and used when rangeCheck is true
+    int regOff = oatAllocTemp(cUnit);
+    int regPtr = oatAllocTemp(cUnit);
+    genNullCheck(cUnit, rlObj.sRegLow, rlObj.lowReg, mir);
+    bool rangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
+    if (rangeCheck) {
+        regMax = oatAllocTemp(cUnit);
+        loadWordDisp(cUnit, rlObj.lowReg, countOffset, regMax);
+    }
+    loadWordDisp(cUnit, rlObj.lowReg, offsetOffset, regOff);
+    loadWordDisp(cUnit, rlObj.lowReg, valueOffset, regPtr);
+    LIR* launchPad = NULL;
+    if (rangeCheck) {
+        // Set up a launch pad to allow retry in case of bounds violation
+        launchPad = rawLIR(cUnit, 0, kPseudoIntrinsicRetry, (int)mir, type);
+        oatInsertGrowableList(cUnit, &cUnit->intrinsicLaunchpads,
+                              (intptr_t)launchPad);
+        opRegReg(cUnit, kOpCmp, rlIdx.lowReg, regMax);
+        oatFreeTemp(cUnit, regMax);
+        opCondBranch(cUnit, kCondCs, launchPad);  // carry set: unsigned idx >= count (covers idx < 0)
+    }
+    opRegImm(cUnit, kOpAdd, regPtr, dataOffset);    // regPtr = address of first char in array_
+    opRegReg(cUnit, kOpAdd, regOff, rlIdx.lowReg);  // regOff = string offset + requested index
+    RegLocation rlDest = inlineTarget(cUnit, bb, mir);
+    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
+    loadBaseIndexed(cUnit, regPtr, regOff, rlResult.lowReg, 1, kUnsignedHalf);
+    oatFreeTemp(cUnit, regOff);
+    oatFreeTemp(cUnit, regPtr);
+    storeValue(cUnit, rlDest, rlResult);
+    if (rangeCheck) {
+        launchPad->operands[2] = NULL;  // no resumption
+        launchPad->operands[3] = (uintptr_t)bb;
+    }
+    // Record that we've already inlined & null checked
+    mir->optimizationFlags |= (MIR_INLINED | MIR_IGNORE_NULL_CHECK);
+    return true;
+#else
+    return false;
+#endif
+}
+
+bool genInlinedMinMaxInt(CompilationUnit *cUnit, BasicBlock* bb, MIR *mir,
+                         bool isMin)
+{   // Inline Math.min/max(int, int): compare, then a conditional pair of moves.
+#if defined(TARGET_ARM)
+    RegLocation rlSrc1 = oatGetSrc(cUnit, mir, 0);
+    RegLocation rlSrc2 = oatGetSrc(cUnit, mir, 1);
+    rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
+    rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
+    RegLocation rlDest = inlineTarget(cUnit, bb, mir);
+    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
+    opRegReg(cUnit, kOpCmp, rlSrc1.lowReg, rlSrc2.lowReg);
+    opIT(cUnit, (isMin) ? kArmCondGt : kArmCondLt, "E");  // presumably if-then-else over the two moves
+    opRegReg(cUnit, kOpMov, rlResult.lowReg, rlSrc2.lowReg);  // "then": src2 wins (min: src1 > src2)
+    opRegReg(cUnit, kOpMov, rlResult.lowReg, rlSrc1.lowReg);  // "else": src1 wins
+    genBarrier(cUnit);
+    storeValue(cUnit, rlDest, rlResult);
+    return true;
+#else
+    return false;  // not an intrinsic on this target; caller emits a normal invoke
+#endif
+}
+
+// Generates an inlined String.isEmpty (isEmpty == true) or String.length.
+bool genInlinedStringIsEmptyOrLength(CompilationUnit* cUnit,
+                                            BasicBlock* bb, MIR* mir,
+                                            bool isEmpty)
+{
+#if defined(TARGET_ARM)
+    // dst = src.length();
+    RegLocation rlObj = oatGetSrc(cUnit, mir, 0);
+    rlObj = loadValue(cUnit, rlObj, kCoreReg);
+    RegLocation rlDest = inlineTarget(cUnit, bb, mir);
+    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
+    genNullCheck(cUnit, rlObj.sRegLow, rlObj.lowReg, mir);
+    loadWordDisp(cUnit, rlObj.lowReg, String::CountOffset().Int32Value(),
+                 rlResult.lowReg);
+    if (isEmpty) {
+        // dst = (dst == 0);
+        int tReg = oatAllocTemp(cUnit);
+        opRegReg(cUnit, kOpNeg, tReg, rlResult.lowReg);  // tReg = -dst; assumes carry is set iff dst == 0 - TODO confirm
+        opRegRegReg(cUnit, kOpAdc, rlResult.lowReg, rlResult.lowReg, tReg);  // dst + (-dst) + carry
+    }
+    storeValue(cUnit, rlDest, rlResult);
+    return true;
+#else
+    return false;  // not an intrinsic on this target; caller emits a normal invoke
+#endif
+}
+
+bool genInlinedAbsInt(CompilationUnit *cUnit, BasicBlock* bb, MIR *mir)
+{   // Inline Math.abs(int) without a branch.
+#if defined(TARGET_ARM)
+    RegLocation rlSrc = oatGetSrc(cUnit, mir, 0);
+    rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+    RegLocation rlDest = inlineTarget(cUnit, bb, mir);
+    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
+    int signReg = oatAllocTemp(cUnit);
+    // abs(x): y = x >> 31 (arithmetic shift), result = (x + y) ^ y.
+    opRegRegImm(cUnit, kOpAsr, signReg, rlSrc.lowReg, 31);
+    opRegRegReg(cUnit, kOpAdd, rlResult.lowReg, rlSrc.lowReg, signReg);
+    opRegReg(cUnit, kOpXor, rlResult.lowReg, signReg);
+    storeValue(cUnit, rlDest, rlResult);
+    return true;
+#else
+    return false;  // not an intrinsic on this target; caller emits a normal invoke
+#endif
+}
+
+bool genInlinedAbsLong(CompilationUnit *cUnit, BasicBlock* bb, MIR *mir)
+{   // Inline Math.abs(long) without a branch, operating on the low/high register pair.
+#if defined(TARGET_ARM)
+    RegLocation rlSrc = oatGetSrcWide(cUnit, mir, 0, 1);
+    rlSrc = loadValueWide(cUnit, rlSrc, kCoreReg);
+    RegLocation rlDest = inlineTargetWide(cUnit, bb, mir);
+    RegLocation rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
+    int signReg = oatAllocTemp(cUnit);
+    // abs(x): y = x.high >> 31 (arithmetic shift), result = (x + y) ^ y on both words.
+    opRegRegImm(cUnit, kOpAsr, signReg, rlSrc.highReg, 31);
+    opRegRegReg(cUnit, kOpAdd, rlResult.lowReg, rlSrc.lowReg, signReg);    // low word of x + y
+    opRegRegReg(cUnit, kOpAdc, rlResult.highReg, rlSrc.highReg, signReg);  // high word, plus carry
+    opRegReg(cUnit, kOpXor, rlResult.lowReg, signReg);
+    opRegReg(cUnit, kOpXor, rlResult.highReg, signReg);
+    storeValueWide(cUnit, rlDest, rlResult);
+    return true;
+#else
+    return false;  // not an intrinsic on this target; caller emits a normal invoke
+#endif
+}
+
+bool genInlinedFloatCvt(CompilationUnit *cUnit, BasicBlock* bb, MIR *mir)
+{   // Shared by floatToRawIntBits and intBitsToFloat (see genIntrinsic).
+#if defined(TARGET_ARM)
+    RegLocation rlSrc = oatGetSrc(cUnit, mir, 0);
+    RegLocation rlDest = inlineTarget(cUnit, bb, mir);
+    storeValue(cUnit, rlDest, rlSrc);  // the source value is stored to the result location as-is
+    return true;
+#else
+    return false;  // not an intrinsic on this target; caller emits a normal invoke
+#endif
+}
+
+bool genInlinedDoubleCvt(CompilationUnit *cUnit, BasicBlock* bb, MIR *mir)
+{   // Shared by doubleToRawLongBits and longBitsToDouble (see genIntrinsic).
+#if defined(TARGET_ARM)
+    RegLocation rlSrc = oatGetSrcWide(cUnit, mir, 0, 1);
+    RegLocation rlDest = inlineTargetWide(cUnit, bb, mir);
+    storeValueWide(cUnit, rlDest, rlSrc);  // the wide source value is stored to the result location as-is
+    return true;
+#else
+    return false;  // not an intrinsic on this target; caller emits a normal invoke
+#endif
+}
+
+/*
+ * Fast String.indexOf(I) & (II).  Handles the simple case of char <= 0xffff
+ * via the pIndexOf helper, otherwise bails to standard library code.
+ */
+bool genInlinedIndexOf(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+                       InvokeType type, bool zeroBased)
+{
+#if defined(TARGET_ARM)
+
+    oatClobberCalleeSave(cUnit);
+    oatLockCallTemps(cUnit);  // Using fixed registers
+    int regPtr = rARG0;
+    int regChar = rARG1;
+    int regStart = rARG2;
+
+    RegLocation rlObj = oatGetSrc(cUnit, mir, 0);
+    RegLocation rlChar = oatGetSrc(cUnit, mir, 1);
+    RegLocation rlStart = oatGetSrc(cUnit, mir, 2);  // NOTE(review): fetched even when zeroBased - confirm a 3rd source exists
+    loadValueDirectFixed(cUnit, rlObj, regPtr);
+    loadValueDirectFixed(cUnit, rlChar, regChar);
+    if (zeroBased) {
+        loadConstant(cUnit, regStart, 0);  // indexOf(int): search from index 0
+    } else {
+        loadValueDirectFixed(cUnit, rlStart, regStart);
+    }
+    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread, pIndexOf));
+    genNullCheck(cUnit, rlObj.sRegLow, regPtr, mir);
+    LIR* launchPad = rawLIR(cUnit, 0, kPseudoIntrinsicRetry, (int)mir, type);
+    oatInsertGrowableList(cUnit, &cUnit->intrinsicLaunchpads,
+                          (intptr_t)launchPad);
+    opCmpImmBranch(cUnit, kCondGt, regChar, 0xFFFF, launchPad);  // char > 0xffff: take the retry path
+    opReg(cUnit, kOpBlx, rTgt);
+    LIR* resumeTgt = newLIR0(cUnit, kPseudoTargetLabel);
+    launchPad->operands[2] = (uintptr_t)resumeTgt;  // retry path branches back here after its invoke
+    launchPad->operands[3] = (uintptr_t)bb;
+    // Record that we've already inlined & null checked
+    mir->optimizationFlags |= (MIR_INLINED | MIR_IGNORE_NULL_CHECK);
+    return true;
+#else
+    return false;
+#endif
+}
+
+/* Fast String.compareTo(Ljava/lang/String;)I. */
+bool genInlinedStringCompareTo(CompilationUnit* cUnit, BasicBlock* bb,
+                               MIR* mir, InvokeType type)
+{
+#if defined(TARGET_ARM)
+    oatClobberCalleeSave(cUnit);
+    oatLockCallTemps(cUnit);  // Using fixed registers
+    int regThis = rARG0;
+    int regCmp = rARG1;
+
+    RegLocation rlThis = oatGetSrc(cUnit, mir, 0);
+    RegLocation rlCmp = oatGetSrc(cUnit, mir, 1);
+    loadValueDirectFixed(cUnit, rlThis, regThis);
+    loadValueDirectFixed(cUnit, rlCmp, regCmp);
+    int rTgt = loadHelper(cUnit, OFFSETOF_MEMBER(Thread, pStringCompareTo));
+    genNullCheck(cUnit, rlThis.sRegLow, regThis, mir);
+    //TUNING: check if rlCmp.sRegLow is already null checked
+    LIR* launchPad = rawLIR(cUnit, 0, kPseudoIntrinsicRetry, (int)mir, type);
+    oatInsertGrowableList(cUnit, &cUnit->intrinsicLaunchpads,
+                          (intptr_t)launchPad);
+    opCmpImmBranch(cUnit, kCondEq, regCmp, 0, launchPad);  // null argument: take the retry path
+    opReg(cUnit, kOpBlx, rTgt);
+    launchPad->operands[2] = NULL;  // No return possible
+    launchPad->operands[3] = (uintptr_t)bb;
+    // Record that we've already inlined & null checked
+    mir->optimizationFlags |= (MIR_INLINED | MIR_IGNORE_NULL_CHECK);
+    return true;
+#else
+    return false;
+#endif
+}
+
+bool genIntrinsic(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+                  InvokeType type, bool isRange)
+{   // Returns true if the invoke was expanded inline; on false, genInvoke emits the normal call.
+    if ((mir->optimizationFlags & MIR_INLINED) || isRange)  {
+        return false;  // already expanded once (launchpad retry) or a range invoke
+    }
+    /*
+     * TODO: move these to a target-specific structured constant array
+     * and use a generic match function.  The list of intrinsics may be
+     * slightly different depending on target.
+     * TODO: Fold this into a matching function that runs during
+     * basic block building.  This should be part of the action for
+     * small method inlining and recognition of the special object init
+     * method.  By doing this during basic block construction, we can also
+     * take advantage of/generate new useful dataflow info.
+     */
+    std::string tgtMethod = PrettyMethod(mir->dalvikInsn.vB, *cUnit->dex_file);
+    if (tgtMethod.compare("char java.lang.String.charAt(int)") == 0) {
+        return genInlinedCharAt(cUnit, bb, mir, type, isRange);
+    }
+    if (tgtMethod.compare("int java.lang.Math.min(int, int)") == 0) {
+        return genInlinedMinMaxInt(cUnit, bb, mir, true /* isMin */);
+    }
+    if (tgtMethod.compare("int java.lang.Math.max(int, int)") == 0) {
+        return genInlinedMinMaxInt(cUnit, bb, mir, false /* isMin */);
+    }
+    if (tgtMethod.compare("int java.lang.String.length()") == 0) {
+        return genInlinedStringIsEmptyOrLength(cUnit, bb, mir, false /* isEmpty */);
+    }
+    if (tgtMethod.compare("boolean java.lang.String.isEmpty()") == 0) {
+        return genInlinedStringIsEmptyOrLength(cUnit, bb, mir, true /* isEmpty */);
+    }
+    if (tgtMethod.compare("int java.lang.Math.abs(int)") == 0) {
+        return genInlinedAbsInt(cUnit, bb, mir);
+    }
+    if (tgtMethod.compare("long java.lang.Math.abs(long)") == 0) {
+        return genInlinedAbsLong(cUnit, bb, mir);
+    }
+    if (tgtMethod.compare("int java.lang.Float.floatToRawIntBits(float)") == 0) {
+        return genInlinedFloatCvt(cUnit, bb, mir);
+    }
+    if (tgtMethod.compare("float java.lang.Float.intBitsToFloat(int)") == 0) {
+        return genInlinedFloatCvt(cUnit, bb, mir);
+    }
+    if (tgtMethod.compare("long java.lang.Double.doubleToRawLongBits(double)") == 0) {
+        return genInlinedDoubleCvt(cUnit, bb, mir);
+    }
+    if (tgtMethod.compare("double java.lang.Double.longBitsToDouble(long)") == 0) {
+        return genInlinedDoubleCvt(cUnit, bb, mir);
+    }
+    if (tgtMethod.compare("int java.lang.String.indexOf(int, int)") == 0) {
+        return genInlinedIndexOf(cUnit, bb, mir, type, false /* zeroBased */);
+    }
+    if (tgtMethod.compare("int java.lang.String.indexOf(int)") == 0) {
+        return genInlinedIndexOf(cUnit, bb, mir, type, true /* zeroBased */);
+    }
+    if (tgtMethod.compare("int java.lang.String.compareTo(java.lang.String)") == 0) {
+        return genInlinedStringCompareTo(cUnit, bb, mir, type);
+    }
+    return false;
+}
+
+
 }  // namespace art
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index 205a65a..87a86eb 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -52,8 +52,12 @@
     return res;
 }
 
-void genInvoke(CompilationUnit* cUnit, MIR* mir, InvokeType type, bool isRange)
+void genInvoke(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+               InvokeType type, bool isRange)
 {
+    if (genIntrinsic(cUnit, bb, mir, type, isRange)) {
+        return;
+    }
     DecodedInstruction* dInsn = &mir->dalvikInsn;
     InvokeType originalType = type;  // avoiding mutation by ComputeInvokeInfo
     int callState = 0;
@@ -543,38 +547,38 @@
             break;
 
         case Instruction::INVOKE_STATIC_RANGE:
-            genInvoke(cUnit, mir, kStatic, true /*range*/);
+            genInvoke(cUnit, bb, mir, kStatic, true /*range*/);
             break;
         case Instruction::INVOKE_STATIC:
-            genInvoke(cUnit, mir, kStatic, false /*range*/);
+            genInvoke(cUnit, bb, mir, kStatic, false /*range*/);
             break;
 
         case Instruction::INVOKE_DIRECT:
-            genInvoke(cUnit, mir, kDirect, false /*range*/);
+            genInvoke(cUnit, bb,  mir, kDirect, false /*range*/);
             break;
         case Instruction::INVOKE_DIRECT_RANGE:
-            genInvoke(cUnit, mir, kDirect, true /*range*/);
+            genInvoke(cUnit, bb, mir, kDirect, true /*range*/);
             break;
 
         case Instruction::INVOKE_VIRTUAL:
-            genInvoke(cUnit, mir, kVirtual, false /*range*/);
+            genInvoke(cUnit, bb, mir, kVirtual, false /*range*/);
             break;
         case Instruction::INVOKE_VIRTUAL_RANGE:
-            genInvoke(cUnit, mir, kVirtual, true /*range*/);
+            genInvoke(cUnit, bb, mir, kVirtual, true /*range*/);
             break;
 
         case Instruction::INVOKE_SUPER:
-            genInvoke(cUnit, mir, kSuper, false /*range*/);
+            genInvoke(cUnit, bb, mir, kSuper, false /*range*/);
             break;
         case Instruction::INVOKE_SUPER_RANGE:
-            genInvoke(cUnit, mir, kSuper, true /*range*/);
+            genInvoke(cUnit, bb, mir, kSuper, true /*range*/);
             break;
 
         case Instruction::INVOKE_INTERFACE:
-            genInvoke(cUnit, mir, kInterface, false /*range*/);
+            genInvoke(cUnit, bb, mir, kInterface, false /*range*/);
             break;
         case Instruction::INVOKE_INTERFACE_RANGE:
-            genInvoke(cUnit, mir, kInterface, true /*range*/);
+            genInvoke(cUnit, bb, mir, kInterface, true /*range*/);
             break;
 
         case Instruction::NEG_INT:
@@ -945,6 +949,8 @@
 
     handleThrowLaunchpads(cUnit);
 
+    handleIntrinsicLaunchpads(cUnit);
+
     if (!(cUnit->disableOpt & (1 << kSafeOptimizations))) {
         removeRedundantBranches(cUnit);
     }
diff --git a/src/compiler/codegen/arm/ArmLIR.h b/src/compiler/codegen/arm/ArmLIR.h
index 2d3028a..da39713 100644
--- a/src/compiler/codegen/arm/ArmLIR.h
+++ b/src/compiler/codegen/arm/ArmLIR.h
@@ -290,6 +290,7 @@
  * Assemble.cc.
  */
 enum ArmOpcode {
+    kPseudoIntrinsicRetry = -16,
     kPseudoSuspendTarget = -15,
     kPseudoThrowTarget = -14,
     kPseudoCaseLabel = -13,
diff --git a/src/compiler/codegen/arm/Thumb2/Factory.cc b/src/compiler/codegen/arm/Thumb2/Factory.cc
index c79f7c6..11d5bf4 100644
--- a/src/compiler/codegen/arm/Thumb2/Factory.cc
+++ b/src/compiler/codegen/arm/Thumb2/Factory.cc
@@ -681,6 +681,7 @@
                 opRegRegReg(cUnit, kOpAdd, regPtr, rBase, rIndex);
             }
             load = newLIR3(cUnit, opcode, rDest, regPtr, 0);
+            oatFreeTemp(cUnit, regPtr);
             return load;
         case kWord:
             opcode = (thumbForm) ? kThumbLdrRRR : kThumb2LdrRRR;
@@ -745,6 +746,7 @@
                 opRegRegReg(cUnit, kOpAdd, regPtr, rBase, rIndex);
             }
             store = newLIR3(cUnit, opcode, rSrc, regPtr, 0);
+            oatFreeTemp(cUnit, regPtr);
             return store;
         case kWord:
             opcode = (thumbForm) ? kThumbStrRRR : kThumb2StrRRR;
diff --git a/src/compiler/codegen/mips/MipsLIR.h b/src/compiler/codegen/mips/MipsLIR.h
index 5c8fc34..c0ff298 100644
--- a/src/compiler/codegen/mips/MipsLIR.h
+++ b/src/compiler/codegen/mips/MipsLIR.h
@@ -333,6 +333,7 @@
  * Assemble.cc.
  */
 enum MipsOpCode {
+    kPseudoIntrinsicRetry = -16,
     kPseudoSuspendTarget = -15,
     kPseudoThrowTarget = -14,
     kPseudoCaseLabel = -13,
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index 378c24d..1fc44b3 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -292,6 +292,7 @@
  * Assemble.cc.
  */
 enum X86OpCode {
+    kPseudoIntrinsicRetry = -16,
     kPseudoSuspendTarget = -15,
     kPseudoThrowTarget = -14,
     kPseudoCaseLabel = -13,
diff --git a/src/object.h b/src/object.h
index cfea1c9..61d0965 100644
--- a/src/object.h
+++ b/src/object.h
@@ -2179,16 +2179,28 @@
 // C++ mirror of java.lang.String
 class MANAGED String : public Object {
  public:
+
+  static MemberOffset CountOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(String, count_);
+  }
+
+  static MemberOffset ValueOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(String, array_);
+  }
+
+  static MemberOffset OffsetOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(String, offset_);
+  }
+
   const CharArray* GetCharArray() const {
     const CharArray* result = GetFieldObject<const CharArray*>(
-        OFFSET_OF_OBJECT_MEMBER(String, array_), false);
+        ValueOffset(), false);
     DCHECK(result != NULL);
     return result;
   }
 
   int32_t GetOffset() const {
-    int32_t result = GetField32(
-        OFFSET_OF_OBJECT_MEMBER(String, offset_), false);
+    int32_t result = GetField32(OffsetOffset(), false);
     DCHECK_LE(0, result);
     return result;
   }
diff --git a/src/object_test.cc b/src/object_test.cc
index 3abe702..3512171 100644
--- a/src/object_test.cc
+++ b/src/object_test.cc
@@ -25,6 +25,7 @@
 #include "dex_file.h"
 #include "heap.h"
 #include "runtime_support.h"
+#include "asm_support.h"
 
 namespace art {
 
@@ -54,6 +55,14 @@
   }
 };
 
+// Keep the hard-coded offsets in asm_support.h in sync with the C++ object layout
+TEST_F(ObjectTest, AsmConstants) {
+  ASSERT_EQ(STRING_VALUE_OFFSET, String::ValueOffset().Int32Value());
+  ASSERT_EQ(STRING_COUNT_OFFSET, String::CountOffset().Int32Value());
+  ASSERT_EQ(STRING_OFFSET_OFFSET, String::OffsetOffset().Int32Value());
+  ASSERT_EQ(STRING_DATA_OFFSET, Array::DataOffset(sizeof(uint16_t)).Int32Value());
+}
+
 TEST_F(ObjectTest, IsInSamePackage) {
   // Matches
   EXPECT_TRUE(Class::IsInSamePackage("Ljava/lang/Object;", "Ljava/lang/Class;"));
diff --git a/src/runtime_support.h b/src/runtime_support.h
index 0b9e0c4..35f88b5 100644
--- a/src/runtime_support.h
+++ b/src/runtime_support.h
@@ -59,6 +59,9 @@
 
 #if defined(__arm__)
   /* Compiler helpers */
+  extern "C" int32_t __memcmp16(void*, void*, int32_t);
+  extern "C" int32_t art_indexof(void*, uint32_t, uint32_t, uint32_t);
+  extern "C" int32_t art_string_compareto(void*, void*);
   extern "C" int32_t art_get32_static_from_code(uint32_t);
   extern "C" int64_t art_get64_static_from_code(uint32_t);
   extern "C" void* art_get_obj_static_from_code(uint32_t);
diff --git a/src/runtime_support_arm.S b/src/runtime_support_arm.S
index 606fa1f..653465a 100644
--- a/src/runtime_support_arm.S
+++ b/src/runtime_support_arm.S
@@ -804,3 +804,234 @@
     movpl   r0, r1, lsr ip              @  if r2 >= 32, r0<-r1 >>> (r2-32)
     mov     r1, r1, lsr r2              @  r1<- r1 >>> r2
     bx      lr
+
+    .balign 4
+    .global art_indexof
+art_indexof:
+    /*
+     * String's indexOf.
+     *
+     * On entry:
+     *    r0:   string object (known non-null)
+     *    r1:   char to match
+     *    r2:   Starting offset in string data
+     */
+
+    push {r4, r10-r11, lr} @ 4 words of callee saves
+    ldr   r3, [r0, #STRING_COUNT_OFFSET]
+    ldr   r12, [r0, #STRING_OFFSET_OFFSET]
+    ldr   r0, [r0, #STRING_VALUE_OFFSET]
+
+    /* Clamp start to [0..count] */
+    cmp   r2, #0
+    movlt r2, #0
+    cmp   r2, r3
+    movgt r2, r3
+
+    /* Build a pointer to the start of string data */
+    add   r0, #STRING_DATA_OFFSET
+    add   r0, r0, r12, lsl #1
+
+    /* Save a copy in r12 to later compute result */
+    mov   r12, r0
+
+    /* Build pointer to start of data to compare and pre-bias */
+    add   r0, r0, r2, lsl #1
+    sub   r0, #2
+
+    /* Compute iteration count */
+    sub   r2, r3, r2
+
+    /*
+     * At this point we have:
+     *   r0: start of data to test
+     *   r1: char to compare
+     *   r2: iteration count
+     *   r12: original start of string data
+     *   r3, r4, r10, r11 available for loading string data
+     */
+
+    subs  r2, #4
+    blt   indexof_remainder
+
+indexof_loop4:
+    ldrh  r3, [r0, #2]!
+    ldrh  r4, [r0, #2]!
+    ldrh  r10, [r0, #2]!
+    ldrh  r11, [r0, #2]!
+    cmp   r3, r1
+    beq   match_0
+    cmp   r4, r1
+    beq   match_1
+    cmp   r10, r1
+    beq   match_2
+    cmp   r11, r1
+    beq   match_3
+    subs  r2, #4
+    bge   indexof_loop4
+
+indexof_remainder:
+    adds    r2, #4
+    beq     indexof_nomatch
+
+indexof_loop1:
+    ldrh  r3, [r0, #2]!
+    cmp   r3, r1
+    beq   match_3
+    subs  r2, #1
+    bne   indexof_loop1
+
+indexof_nomatch:
+    mov   r0, #-1
+    pop {r4, r10-r11, pc}
+
+match_0:
+    sub   r0, #6
+    sub   r0, r12
+    asr   r0, r0, #1
+    pop {r4, r10-r11, pc}
+match_1:
+    sub   r0, #4
+    sub   r0, r12
+    asr   r0, r0, #1
+    pop {r4, r10-r11, pc}
+match_2:
+    sub   r0, #2
+    sub   r0, r12
+    asr   r0, r0, #1
+    pop {r4, r10-r11, pc}
+match_3:
+    sub   r0, r12
+    asr   r0, r0, #1
+    pop {r4, r10-r11, pc}
+
+
+   /*
+     * String's compareTo.
+     *
+     * Requires rARG0/rARG1 to have been previously checked for null.  Will
+     * return negative if this string is < comp, 0 if they are the
+     * same and positive if >.
+     *
+     * On entry:
+     *    r0:   this object pointer
+     *    r1:   comp object pointer
+     *
+     */
+
+    .balign 4
+    .global art_string_compareto
+    .extern __memcmp16
+art_string_compareto:
+    mov    r2, r0         @ this to r2, opening up r0 for return value
+    subs   r0, r2, r1     @ Same?
+    bxeq   lr
+
+    push {r4, r7-r12, lr} @ 8 words - keep alignment
+
+    ldr    r4, [r2, #STRING_OFFSET_OFFSET]
+    ldr    r9, [r1, #STRING_OFFSET_OFFSET]
+    ldr    r7, [r2, #STRING_COUNT_OFFSET]
+    ldr    r10, [r1, #STRING_COUNT_OFFSET]
+    ldr    r2, [r2, #STRING_VALUE_OFFSET]
+    ldr    r1, [r1, #STRING_VALUE_OFFSET]
+
+    /*
+     * At this point, we have:
+     *    value:  r2/r1
+     *    offset: r4/r9
+     *    count:  r7/r10
+     * We're going to compute
+     *    r11 <- countDiff
+     *    r10 <- minCount
+     */
+     subs  r11, r7, r10
+     movls r10, r7
+
+     /* Now, build pointers to the string data */
+     add   r2, r2, r4, lsl #1
+     add   r1, r1, r9, lsl #1
+     /*
+      * Note: data pointers point to previous element so we can use pre-index
+      * mode with base writeback.
+      */
+     add   r2, #STRING_DATA_OFFSET-2   @ offset to contents[-1]
+     add   r1, #STRING_DATA_OFFSET-2   @ offset to contents[-1]
+
+     /*
+      * At this point we have:
+      *   r2: *this string data
+      *   r1: *comp string data
+      *   r10: iteration count for comparison
+      *   r11: value to return if the first part of the string is equal
+      *   r0: reserved for result
+      *   r3, r4, r7, r8, r9, r12 available for loading string data
+      */
+
+    subs  r10, #2
+    blt   do_remainder2
+
+      /*
+       * Unroll the first two checks so we can quickly catch early mismatch
+       * on long strings (but preserve incoming alignment)
+       */
+
+    ldrh  r3, [r2, #2]!
+    ldrh  r4, [r1, #2]!
+    ldrh  r7, [r2, #2]!
+    ldrh  r8, [r1, #2]!
+    subs  r0, r3, r4
+    subeqs  r0, r7, r8
+    bne   done
+    cmp   r10, #28
+    bgt   do_memcmp16
+    subs  r10, #3
+    blt   do_remainder
+
+loopback_triple:
+    ldrh  r3, [r2, #2]!
+    ldrh  r4, [r1, #2]!
+    ldrh  r7, [r2, #2]!
+    ldrh  r8, [r1, #2]!
+    ldrh  r9, [r2, #2]!
+    ldrh  r12,[r1, #2]!
+    subs  r0, r3, r4
+    subeqs  r0, r7, r8
+    subeqs  r0, r9, r12
+    bne   done
+    subs  r10, #3
+    bge   loopback_triple
+
+do_remainder:
+    adds  r10, #3
+    beq   returnDiff
+
+loopback_single:
+    ldrh  r3, [r2, #2]!
+    ldrh  r4, [r1, #2]!
+    subs  r0, r3, r4
+    bne   done
+    subs  r10, #1
+    bne     loopback_single
+
+returnDiff:
+    mov   r0, r11
+    pop   {r4, r7-r12, pc}
+
+do_remainder2:
+    adds  r10, #2
+    bne   loopback_single
+    mov   r0, r11
+    pop   {r4, r7-r12, pc}
+
+    /* Long string case */
+do_memcmp16:
+    mov   r7, r11
+    add   r0, r2, #2
+    add   r1, r1, #2
+    mov   r2, r10
+    bl    __memcmp16
+    cmp   r0, #0
+    moveq r0, r7
+done:
+    pop   {r4, r7-r12, pc}
diff --git a/src/thread.cc b/src/thread.cc
index 522024c..74165da 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -101,6 +101,9 @@
   pCmpgDouble = CmpgDouble;
   pCmplDouble = CmplDouble;
 #elif defined(__arm__)
+  pMemcmp16 = __memcmp16;
+  pIndexOf = art_indexof;
+  pStringCompareTo = art_string_compareto;
   pShlLong = art_shl_long;
   pShrLong = art_shr_long;
   pUshrLong = art_ushr_long;
diff --git a/src/thread.h b/src/thread.h
index a8d0812..8a0c1af 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -679,6 +679,9 @@
   bool (*pCmpgFloat)(float, float);
   bool (*pCmplDouble)(double, double);
   bool (*pCmpgDouble)(double, double);
+  int (*pIndexOf)(void*, uint32_t, uint32_t, uint32_t);
+  int (*pStringCompareTo)(void*, void*);
+  int (*pMemcmp16)(void*, void*, int32_t);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(Thread);