Improved method invocation performance: 1.5x speedup for virtual calls and
2.8x for interface calls.

- Implemented predicted chaining for invoke-virtual and invoke-interface
  (see the C sketch below).
- Trimmed a bit of overhead from the invoke-native path.
- Added the 078-polymorphic-virtual stress test.
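
For reference, a minimal C sketch of the predicted-chaining check that the new
TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN implements. The struct layout is the one
quoted in the template comments below (declared in Armv5teLIR.h); the
resolveAndMaybeRechain prototype and the simplified Object/opaque typedefs are
hypothetical stand-ins for the real Dalvik definitions:

    #include <stdint.h>

    typedef uint32_t u4;
    typedef struct ClassObject ClassObject;  /* opaque; real type in Dalvik headers */
    typedef struct Method Method;            /* opaque; real type in Dalvik headers */
    typedef struct Object {                  /* simplified: clazz is the first field */
        const ClassObject *clazz;
    } Object;

    typedef struct PredictedChainingCell {
        u4 branch;                 /* branch instruction to the chained callee */
        const ClassObject *clazz;  /* predicted receiver class */
        const Method *method;      /* resolved callee for the predicted class */
        u4 counter;                /* countdown before rechaining is attempted */
    } PredictedChainingCell;

    /* Hypothetical stand-in for the lr+4 landing pad: full vtable lookup,
     * which may rewrite (rechain) the cell for the new receiver class. */
    const Method *resolveAndMaybeRechain(Object *thisPtr,
                                         PredictedChainingCell *cell);

    static const Method *predictedDispatch(Object *thisPtr,
                                           PredictedChainingCell *cell)
    {
        if (thisPtr->clazz == cell->clazz)
            return cell->method;   /* prediction hit: fast path via chaining cell */
        cell->counter--;           /* prediction miss: count down toward rechain */
        return resolveAndMaybeRechain(thisPtr, cell);
    }
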
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index 0831100..ff0a953 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -247,12 +247,6 @@
     bxne    lr                          @ bail to the interpreter
     tst     r10, #ACC_NATIVE
     bne     .LinvokeNative
-    /*
-     * If we want to punt to the interpreter for native call, swap the bne with
-     * the following
-     * bxne    lr
-     */
-
 
     ldr     r10, .LdvmJitToInterpNoChain
     ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
@@ -278,6 +272,8 @@
      * method through a dedicated chaining cell.
      */
     @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    @ methodToCall is guaranteed to be non-native
+.LinvokeChain:
     ldrh    r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
     ldrh    r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
     ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
@@ -293,7 +289,6 @@
     bxlt    r12                         @ return to raise stack overflow excep.
     @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
     ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
-    ldr     r10, [r0, #offMethod_accessFlags] @ r10<- methodToCall->accessFlags
     str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
     str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
     ldr     rPC, [r0, #offMethod_insns]     @ rPC<- methodToCall->insns
@@ -305,14 +300,6 @@
     str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
     cmp     r8, #0                      @ suspendCount != 0
     bxne    r12                         @ bail to the interpreter
-    tst     r10, #ACC_NATIVE
-    bne     .LinvokeNative
-    /*
-     * If we want to punt to the interpreter for native call, swap the bne with
-     * the following
-     * bxne    r12
-     */
-
 
     ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
     ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
@@ -329,6 +316,111 @@
 
 /* ------------------------------ */
     .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN
+dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S */
+    /*
+     * For a polymorphic callsite, check whether the cached class pointer matches
+     * the current one. If so, set up the Dalvik frame and return to the
+     * Thumb code through the link register to transfer control to the callee
+     * method through a dedicated chaining cell.
+     *
+     * The predicted chaining cell is declared in Armv5teLIR.h with the
+     * following layout:
+     *
+     *  typedef struct PredictedChainingCell {
+     *      u4 branch;
+     *      const ClassObject *clazz;
+     *      const Method *method;
+     *      u4 counter;
+     *  } PredictedChainingCell;
+     *
+     * Upon returning to the callsite:
+     *    - lr  : to branch to the chaining cell
+     *    - lr+2: to punt to the interpreter
+     *    - lr+4: to fully resolve the callee and possibly rechain.
+     *            r3 <- class
+     *            r9 <- counter
+     */
+    @ r0 = this, r1 = returnCell, r2 = predictedChainCell, rPC = dalvikCallsite
+    ldr     r3, [r0, #offObject_clazz]  @ r3 <- this->class
+    ldr     r8, [r2, #4]    @ r8 <- predictedChainCell->clazz
+    ldr     r0, [r2, #8]    @ r0 <- predictedChainCell->method
+    ldr     r9, [r2, #12]   @ r9 <- predictedChainCell->counter
+    cmp     r3, r8          @ predicted class == actual class?
+    beq     .LinvokeChain   @ predicted chain is valid
+    ldr     r7, [r3, #offClassObject_vtable] @ r7 <- this->class->vtable
+    sub     r1, r9, #1      @ count--
+    str     r1, [r2, #12]   @ write back to PredictedChainingCell->counter
+    add     lr, lr, #4      @ return to fully-resolve landing pad
+    /*
+     * r1 <- count
+     * r2 <- &predictedChainCell
+     * r3 <- this->class
+     * r4 <- dPC
+     * r7 <- this->class->vtable
+     */
+    bx      lr
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    ldrh    r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlt    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- methodToCall->insns
+
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    ldr     r8, [r0, #offMethod_nativeFunc] @ r8<- method->nativeFunc
+    bxne    lr                          @ bail to the interpreter
+
+    @ go ahead and transfer control to the native code
+    ldr     r9, [r3, #offThread_jniLocal_nextEntry] @ r9<- thread->refNext
+    str     r1, [r3, #offThread_curFrame]   @ self->curFrame = newFp
+    str     r9, [r1, #(offStackSaveArea_localRefTop - sizeofStackSaveArea)]
+                                        @ newFp->localRefTop=refNext
+    mov     r9, r3                      @ r9<- glue->self (preserve)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
+
+    mov     r2, r0                      @ r2<- methodToCall
+    mov     r0, r1                      @ r0<- newFP
+    add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+
+    blx     r8                          @ off to the native code
+
+    @ native return; r9=self, r10=newSaveArea
+    @ equivalent to dvmPopJniLocals
+    ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
+    ldr     r0, [r10, #offStackSaveArea_localRefTop] @ r0<- newSave->localRefTop
+    ldr     r1, [r9, #offThread_exception] @ check for exception
+    str     rFP, [r9, #offThread_curFrame]  @ self->curFrame = fp
+    cmp     r1, #0                      @ null?
+    str     r0, [r9, #offThread_jniLocal_nextEntry] @ self->refNext<- r0
+    bne     .LhandleException             @ no, handle exception
+    bx      r2
+
+
+/* ------------------------------ */
+    .balign 4
     .global dvmCompiler_TEMPLATE_CMPG_DOUBLE
 dvmCompiler_TEMPLATE_CMPG_DOUBLE:
 /* File: armv5te/TEMPLATE_CMPG_DOUBLE.S */
@@ -1130,7 +1222,7 @@
     bne     .LhandleException             @ no, handle exception
     bx      r2
 
-/* FIXME - untested */
+/* NOTE - this path can be exercised if the JIT threshold is set to 5 */
 .LhandleException:
     ldr     r0, .LdvmMterpCommonExceptionThrown @ PIC way of getting &func
     ldr     rIBASE, .LdvmAsmInstructionStart    @ same as above