Move some fast invoke checks to CanUseMterp

This speeds up arm64 golem interpreter benchmarks by 1.5%.

Test: test.py -b -r --interpreter --host
Change-Id: Ia9d7c885cd488de56c6b726373072070b509bdf1
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc
index b37a278..5784b9b 100644
--- a/runtime/interpreter/interpreter.cc
+++ b/runtime/interpreter/interpreter.cc
@@ -248,6 +248,14 @@
     bool from_deoptimize = false) REQUIRES_SHARED(Locks::mutator_lock_) {
   DCHECK(!shadow_frame.GetMethod()->IsAbstract());
   DCHECK(!shadow_frame.GetMethod()->IsNative());
+
+  // Check that we are using the right interpreter.
+  if (kIsDebugBuild && self->UseMterp() != CanUseMterp()) {
+    // The flag might be currently being updated on all threads. Retry with lock.
+    MutexLock tll_mu(self, *Locks::thread_list_lock_);
+    DCHECK_EQ(self->UseMterp(), CanUseMterp());
+  }
+
   if (LIKELY(!from_deoptimize)) {  // Entering the method, but not via deoptimization.
     if (kIsDebugBuild) {
       CHECK_EQ(shadow_frame.GetDexPC(), 0u);