More dispatcher tuning for ppc32/64.  Makes a big difference for
perf/tinycc.

- run_thread_for_a_while: just clear this thread's reservation when
  starting, not all of them.

- use a different fast-cache hashing function for ppc32/64 than for
  x86/amd64.  This allows the former to use all the fast-cache entries
  rather than just 1/4 of them.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5441 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/pub_core_transtab_asm.h b/coregrind/pub_core_transtab_asm.h
index 6041dcd..2462615 100644
--- a/coregrind/pub_core_transtab_asm.h
+++ b/coregrind/pub_core_transtab_asm.h
@@ -31,11 +31,31 @@
 #ifndef __PUB_CORE_TRANSTAB_ASM_H
 #define __PUB_CORE_TRANSTAB_ASM_H
 
-/* Constants for the fast translation lookup cache. */
+/* Constants for the fast translation lookup cache.  It is a
+   direct-mapped cache, with 2^VG_TT_FAST_BITS entries.
+
+   On x86/amd64, the cache index is computed as
+   'address[VG_TT_FAST_BITS-1 : 0]'.
+
+   On ppc32/ppc64, the bottom two bits of instruction addresses are
+   zero, so the x86/amd64 function would cause only 1/4 of the entries
+   ever to be used.  So instead the function is '(address >>u
+   2)[VG_TT_FAST_BITS-1 : 0]' on those targets. */
+
 #define VG_TT_FAST_BITS 15
 #define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS)
 #define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1)
 
+/* Fast-cache index for an instruction address (ppc32/64 first shifts out
+   the two low bits).  Not usable in asm land, but this seems a good home. */
+#if defined(VGA_x86) || defined(VGA_amd64)
+#  define VG_TT_FAST_HASH(_addr)  ((((UWord)(_addr))     ) & VG_TT_FAST_MASK)
+#elif defined(VGA_ppc32) || defined(VGA_ppc64)
+#  define VG_TT_FAST_HASH(_addr)  ((((UWord)(_addr)) >> 2) & VG_TT_FAST_MASK)
+#else
+#  error "VG_TT_FAST_HASH: unknown platform"
+#endif
+
 #endif   // __PUB_CORE_TRANSTAB_ASM_H
 
 /*--------------------------------------------------------------------*/