- Track vex r1494 (x86/amd64 change of conventions for getting
  to translations and back to dispatcher, and also different arg
  passing conventions to LibVEX_Translate).

- Rewrite x86 dispatcher to not increment the profiling counters
  unless requested by the user.  This dramatically reduces the
  D1 miss rate and gives a considerable performance improvement
  on x86.  Also, restructure and add comments to dispatch-x86-linux.S
  to make it much easier to follow (imo).

amd64/ppc32/ppc64 fixes to follow.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5345 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/pub_core_dispatch.h b/coregrind/pub_core_dispatch.h
index d3b61b5..25e80cb 100644
--- a/coregrind/pub_core_dispatch.h
+++ b/coregrind/pub_core_dispatch.h
@@ -50,11 +50,23 @@
    signal, for example SIGSEGV, in which case control longjmp()s back past
    here.
 
+   If do_profiling is nonzero, the profile counters arrays should be
+   updated for each translation run.
+
    This code simply handles the common case fast -- when the translation
    address is found in the translation cache.  For anything else, the
    scheduler does the work.
 */
-extern UWord VG_(run_innerloop) ( void* guest_state );
+extern 
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+
+#if defined(VGA_x86) || defined(VGA_amd64)
+/* We need to locate a couple of labels inside VG_(run_innerloop), so
+   that Vex can add branches to them from generated code.  Hence the
+   following somewhat bogus decls.  At least on x86 and amd64. */
+extern void VG_(run_innerloop__dispatch_unprofiled);
+extern void VG_(run_innerloop__dispatch_profiled);
+#endif
 
 #endif   // __PUB_CORE_DISPATCH_H