Rollup changes for Helgrind:

* Add new client request VALGRIND_HG_CLEAN_MEMORY_HEAPBLOCK.  This is
  like VALGRIND_HG_CLEAN_MEMORY but doesn't take an address range.
  Instead it takes a single argument: a pointer to the start of, or to
  anywhere within, a heap-allocated block.  Helgrind then finds the
  block and paints all of it as belonging to the calling thread.  This
  is needed to correctly describe the behaviour of thread-safe
  reference counting when applied to classes involving inheritance of
  release methods or involving multiple inheritance; see the first
  sketch after this list.

* Add statistics counters for all basic VTS operations (tick, join,
  cmpLEQ, cmp_structural).

* Rewrite VTS__cmp_structural to be much faster; the new comparison
  scheme is sketched after this list.
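
The first sketch below shows how the new client request might be
used.  It is illustrative only, not part of this commit: the
'Inner'/'Obj' types, 'release_via_inner' and the use of GCC's
__sync_sub_and_fetch builtin are assumptions standing in for real
refcounting code.  The point is that the only pointer to hand may
aim into the middle of the heap block, as with a non-first base
class under multiple inheritance:

   #include <stddef.h>
   #include <stdlib.h>
   #include "helgrind.h"

   typedef struct { long refcount; } Inner;

   /* 'inner' is deliberately not at offset 0, so a pointer to it
      points into the middle of the containing heap block. */
   typedef struct { int other_state; Inner inner; } Obj;

   void release_via_inner ( Inner* ip )
   {
      /* Atomic decrement; stands in for whatever primitive the
         real refcounting code uses. */
      if (__sync_sub_and_fetch(&ip->refcount, 1) == 0) {
         /* VALGRIND_HG_CLEAN_MEMORY needs a start address and a
            length, neither of which is known here.  The new request
            finds the containing block itself and paints all of it.
            Result: >= 0 (bytes painted) on success, -1 if no block
            contains 'ip', -2 if not running on Helgrind. */
         long painted = VALGRIND_HG_CLEAN_MEMORY_HEAPBLOCK(ip);
         (void)painted;
         free( (char*)ip - offsetof(Obj, inner) );
      }
   }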

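In support of this, HG_(mm_find_containing_block) (hg_main.c) now
returns a Bool and, before falling back to a linear scan of every
block, probes the mallocmeta hash table at data_addr and at up to 15
word-sized steps back from it.  A condensed sketch of the probing
idea follows; 'Block', 'table', 'ht_lookup', 'block_contains' and
'scan_all_blocks' are hypothetical stand-ins for the real
VG_(HT_*)-based code:

   typedef struct Block Block;                                /* hypothetical */
   extern void*  table;                                       /* hypothetical */
   extern Block* ht_lookup ( void* t, unsigned long key );    /* hypothetical */
   extern int    block_contains ( Block* b, unsigned long a );
   extern Block* scan_all_blocks ( void* t, unsigned long a );

   Block* find_containing_block ( unsigned long addr )
   {
      int i;
      /* Probe for blocks starting at addr, addr - 1 word, ...,
         addr - 15 words; a hit here avoids the full scan. */
      for (i = 0; i < 16; i++) {
         Block* b = ht_lookup( table, addr - i * sizeof(unsigned long) );
         if (b != NULL && block_contains(b, addr))
            return b;
      }
      /* All probes missed: pay for a scan of every block. */
      return scan_all_blocks( table, addr );
   }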

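Finally, a sketch of the scheme now used by VTS__cmp_structural; the
'TS' type and the function name are simplified stand-ins.  Both
element arrays are fetched once up front (the real code uses
VG_(getContentsXA_UNSAFE) instead of one VG_(indexXA) call per
element), the common equal-length case is scanned for the first
difference, and otherwise length alone decides the order:

   typedef struct { void* thr; unsigned long long tym; } TS;

   /* Arbitrary total ordering: returns -1, 0 or 1. */
   long cmp_structural_like ( TS* a, long na, TS* b, long nb )
   {
      long i;
      if (na == nb) {
         /* Same-length vectors: find the first difference, if any. */
         for (i = 0; i < na; i++) {
            if (a[i].tym != b[i].tym)
               return a[i].tym < b[i].tym ? -1 : 1;
            if (a[i].thr != b[i].thr)
               return a[i].thr < b[i].thr ? -1 : 1;
         }
         return 0;  /* identical */
      }
      return na < nb ? -1 : 1;
   }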

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11123 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/helgrind/helgrind.h b/helgrind/helgrind.h
index 68bc225..2c30a5a 100644
--- a/helgrind/helgrind.h
+++ b/helgrind/helgrind.h
@@ -114,7 +114,8 @@
       _VG_USERREQ__HG_RESERVED4,              /* Do not use */
       _VG_USERREQ__HG_ARANGE_MAKE_UNTRACKED, /* Addr a, ulong len */
       _VG_USERREQ__HG_ARANGE_MAKE_TRACKED,   /* Addr a, ulong len */
-      _VG_USERREQ__HG_PTHREAD_BARRIER_RESIZE_PRE /* pth_bar_t*, ulong */
+      _VG_USERREQ__HG_PTHREAD_BARRIER_RESIZE_PRE, /* pth_bar_t*, ulong */
+      _VG_USERREQ__HG_CLEAN_MEMORY_HEAPBLOCK  /* Addr start_of_block */
 
    } Vg_TCheckClientRequest;
 
@@ -151,6 +152,17 @@
                                  _arg1, 0,0,0,0);        \
    } while (0)
 
+#define DO_CREQ_W_W(_resF, _dfltF, _creqF, _ty1F,_arg1F) \
+   do {                                                  \
+      long int _qzz_res, _arg1;                          \
+      /* assert(sizeof(_ty1F) == sizeof(long int)); */   \
+      _arg1 = (long int)(_arg1F);                        \
+      VALGRIND_DO_CLIENT_REQUEST(_qzz_res, (_dfltF),     \
+                                 (_creqF),               \
+                                 _arg1, 0,0,0,0);        \
+      _resF = _qzz_res;                                  \
+   } while (0)
+
 #define DO_CREQ_v_WW(_creqF, _ty1F,_arg1F, _ty2F,_arg2F) \
    do {                                                  \
       long int _unused_res, _arg1, _arg2;                \
@@ -307,6 +319,23 @@
                 void*,(_qzz_start),                          \
                 unsigned long,(_qzz_len))
 
+/* The same, but for the heap block starting at _qzz_blockstart.  This
+   allows painting when we only know the address of an object, but not
+   its size, which is sometimes the case in C++ code involving
+   inheritance, and in which RTTI is not, for whatever reason,
+   available.  Returns the number of bytes painted, which can be zero
+   for a zero-sized block.  Hence, return values >= 0 indicate success
+   (the block was found), -1 indicates that no containing block was
+   found, and -2 that we are not running on Helgrind. */
+#define VALGRIND_HG_CLEAN_MEMORY_HEAPBLOCK(_qzz_blockstart)  \
+   (__extension__                                            \
+   ({long int _npainted;                                     \
+     DO_CREQ_W_W(_npainted, (-2)/*default*/,                 \
+                 _VG_USERREQ__HG_CLEAN_MEMORY_HEAPBLOCK,     \
+                            void*,(_qzz_blockstart));        \
+     _npainted;                                              \
+   }))
+
 /* ----------------------------------------------------------
    For error control.
    ---------------------------------------------------------- */
diff --git a/helgrind/hg_errors.c b/helgrind/hg_errors.c
index 2c2fe24..a1ece96 100644
--- a/helgrind/hg_errors.c
+++ b/helgrind/hg_errors.c
@@ -276,11 +276,16 @@
       tl_assert(!xe->XE.Race.descr2);
 
       /* First, see if it's in any heap block.  Unfortunately this
-         means a linear search through all allocated heap blocks. */
-      HG_(mm_find_containing_block)( 
-         &xe->XE.Race.hctxt, &xe->XE.Race.haddr, &xe->XE.Race.hszB,
-         xe->XE.Race.data_addr
-      );
+         means a linear search through all allocated heap blocks.  The
+         assertion says that if it's detected as a heap block, then we
+         must have an allocation context for it, since all heap blocks
+         should have an allocation context. */
+      Bool is_heapblock
+         = HG_(mm_find_containing_block)( 
+              &xe->XE.Race.hctxt, &xe->XE.Race.haddr, &xe->XE.Race.hszB,
+              xe->XE.Race.data_addr
+           );
+      tl_assert(is_heapblock == (xe->XE.Race.hctxt != NULL));
 
       if (!xe->XE.Race.hctxt) {
          /* It's not in any heap block.  See if we can map it to a
diff --git a/helgrind/hg_errors.h b/helgrind/hg_errors.h
index 3b13021..3714346 100644
--- a/helgrind/hg_errors.h
+++ b/helgrind/hg_errors.h
@@ -68,9 +68,13 @@
 extern ULong HG_(stats__string_table_get_map_size) ( void );
 
 /* For error creation: map 'data_addr' to a malloc'd chunk, if any.
-   Slow linear search.  This is an abuse of the normal file structure
-   since this is exported by hg_main.c, not hg_errors.c.  Oh Well. */
-void HG_(mm_find_containing_block)( /*OUT*/ExeContext** where,
+   Slow linear search, accelerated in some special cases by a normal
+   hash search of the mallocmeta table.  This is an abuse of the
+   normal file structure since this is exported by hg_main.c, not
+   hg_errors.c.  Oh Well.  Returns True if found, False if not.
+   Zero-sized blocks are considered to contain the searched-for
+   address if they equal that address. */
+Bool HG_(mm_find_containing_block)( /*OUT*/ExeContext** where,
                                     /*OUT*/Addr*        payload,
                                     /*OUT*/SizeT*       szB,
                                     Addr                data_addr );
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c
index 7df5e47..ab9da5d 100644
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -3879,32 +3879,62 @@
 
 
 /* For error creation: map 'data_addr' to a malloc'd chunk, if any.
-   Slow linear search. */
+   Slow linear search.  With a bit of hash table help if 'data_addr'
+   is either the start of a block or up to 15 word-sized steps along
+   from the start of a block. */
 
 static inline Bool addr_is_in_MM_Chunk( MallocMeta* mm, Addr a )
 {
-   if (a < mm->payload) return False;
-   if (a >= mm->payload + mm->szB) return False;
-   return True;
+   /* Accept 'a' as within 'mm' if 'mm's size is zero and 'a' points
+      right at it. */
+   if (UNLIKELY(mm->szB == 0 && a == mm->payload))
+      return True;
+   /* else normal interval rules apply */
+   if (LIKELY(a < mm->payload)) return False;
+   if (LIKELY(a >= mm->payload + mm->szB)) return False;
+   return True;
 }
 
-void HG_(mm_find_containing_block)( /*OUT*/ExeContext** where,
+Bool HG_(mm_find_containing_block)( /*OUT*/ExeContext** where,
                                     /*OUT*/Addr*        payload,
                                     /*OUT*/SizeT*       szB,
                                     Addr                data_addr )
 {
    MallocMeta* mm;
+   Int i;
+   const Int n_fast_check_words = 16;
+
+   /* First, do a few fast searches on the basis that data_addr might
+      be exactly the start of a block or up to 15 words inside.  This
+      can happen commonly via the creq
+      _VG_USERREQ__HG_CLEAN_MEMORY_HEAPBLOCK. */
+   for (i = 0; i < n_fast_check_words; i++) {
+      mm = VG_(HT_lookup)( hg_mallocmeta_table,
+                           data_addr - (UWord)(UInt)i * sizeof(UWord) );
+      if (UNLIKELY(mm && addr_is_in_MM_Chunk(mm, data_addr)))
+         goto found;
+   }
+
    /* Well, this totally sucks.  But without using an interval tree or
-      some such, it's hard to see how to do better. */
+      some such, it's hard to see how to do better.  We have to check
+      every block in the entire table. */
    VG_(HT_ResetIter)(hg_mallocmeta_table);
    while ( (mm = VG_(HT_Next)(hg_mallocmeta_table)) ) {
-      if (UNLIKELY(addr_is_in_MM_Chunk(mm, data_addr))) {
-         *where   = mm->where;
-         *payload = mm->payload;
-         *szB     = mm->szB;
-         return;
-      }
+      if (UNLIKELY(addr_is_in_MM_Chunk(mm, data_addr)))
+         goto found;
    }
+
+   /* Not found.  Bah. */
+   return False;
+   /*NOTREACHED*/
+
+  found:
+   tl_assert(mm);
+   tl_assert(addr_is_in_MM_Chunk(mm, data_addr));
+   if (where)   *where   = mm->where;
+   if (payload) *payload = mm->payload;
+   if (szB)     *szB     = mm->szB;
+   return True;
 }
 
 
@@ -4295,6 +4325,23 @@
          }
          break;
 
+      case _VG_USERREQ__HG_CLEAN_MEMORY_HEAPBLOCK: {
+         Addr  payload = 0;
+         SizeT pszB = 0;
+         if (0) VG_(printf)("VG_USERREQ__HG_CLEAN_MEMORY_HEAPBLOCK(%#lx)\n",
+                            args[1]);
+         if (HG_(mm_find_containing_block)(NULL, &payload, &pszB, args[1])) {
+            if (pszB > 0) {
+               evh__die_mem(payload, pszB);
+               evh__new_mem(payload, pszB);
+            }
+            *ret = pszB;
+         } else {
+            *ret = (UWord)-1;
+         }
+         break;
+      }
+
       case _VG_USERREQ__HG_ARANGE_MAKE_UNTRACKED:
          if (0) VG_(printf)("HG_ARANGE_MAKE_UNTRACKED(%#lx,%ld)\n",
                             args[1], args[2]);
diff --git a/helgrind/libhb_core.c b/helgrind/libhb_core.c
index 622d802..004ce69 100644
--- a/helgrind/libhb_core.c
+++ b/helgrind/libhb_core.c
@@ -341,6 +341,16 @@
 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
 static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
+static UWord stats__vts__tick            = 0; // # calls to VTS__tick
+static UWord stats__vts__join            = 0; // # calls to VTS__join
+static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
+static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural
+static UWord stats__vts__cmp_structural_slow = 0; // # calls to VTS__cmp_structural w/ slow case
+static UWord stats__vts__indexat_slow    = 0; // # calls to VTS__indexAt_SLOW
+static UWord stats__vts_set__fadoa       = 0; // # calls to vts_set__find_and_dealloc__or_add
+static UWord stats__vts_set__fadoa_d     = 0; // # calls to vts_set__find_and_dealloc__or_add
+                                              // that lead to a deallocation
+
 
 static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
    return a & ~(N_SECMAP_ARANGE - 1);
@@ -1660,6 +1670,9 @@
    ScalarTS  tmp;
    VTS*      res;
    Word      i, n; 
+
+   stats__vts__tick++;
+
    tl_assert(me);
    tl_assert(is_sane_VTS(vts));
    //if (0) VG_(printf)("tick vts thrno %ld szin %d\n",
@@ -1730,6 +1743,8 @@
    Thr*     thr;
    VTS*     res;
 
+   stats__vts__join++;
+
    tl_assert(a && a->ts);
    tl_assert(b && b->ts);
    useda = VG_(sizeXA)( a->ts );
@@ -1821,6 +1836,8 @@
    Word  ia, ib, useda, usedb;
    ULong tyma, tymb;
 
+   stats__vts__cmpLEQ++;
+
    tl_assert(a && a->ts);
    tl_assert(b && b->ts);
    useda = VG_(sizeXA)( a->ts );
@@ -1903,37 +1920,59 @@
 
 /* Compute an arbitrary structural (total) ordering on the two args,
    based on their VCs, so they can be looked up in a table, tree, etc.
-   Returns -1, 0 or 1.  (really just 'deriving Ord' :-)
+   Returns -1, 0 or 1.  (really just 'deriving Ord' :-)  This can be
+   performance critical, so some effort is expended to make it as
+   fast as possible.
 */
 Word VTS__cmp_structural ( VTS* a, VTS* b )
 {
    /* We just need to generate an arbitrary total ordering based on
       a->ts and b->ts.  Preferably do it in a way which comes across likely
       differences relatively quickly. */
-   Word     i, useda, usedb;
-   ScalarTS *tmpa, *tmpb;
+   Word     i;
+   Word     useda = 0,    usedb = 0;
+   ScalarTS *ctsa = NULL, *ctsb = NULL;
 
-   tl_assert(a && a->ts);
-   tl_assert(b && b->ts);
-   useda = VG_(sizeXA)( a->ts );
-   usedb = VG_(sizeXA)( b->ts );
+   stats__vts__cmp_structural++;
+
+   tl_assert(a);
+   tl_assert(b);
+
+   VG_(getContentsXA_UNSAFE)( a->ts, (void**)&ctsa, &useda );
+   VG_(getContentsXA_UNSAFE)( b->ts, (void**)&ctsb, &usedb );
+
+   if (LIKELY(useda == usedb)) {
+      ScalarTS *tmpa = NULL, *tmpb = NULL;
+      stats__vts__cmp_structural_slow++;
+      /* Same length vectors.  Find the first difference, if any, as
+         fast as possible. */
+      for (i = 0; i < useda; i++) {
+         tmpa = &ctsa[i];
+         tmpb = &ctsb[i];
+         if (LIKELY(tmpa->tym == tmpb->tym && tmpa->thr == tmpb->thr))
+            continue;
+         else
+            break;
+      }
+      if (UNLIKELY(i == useda)) {
+         /* They're identical. */
+         return 0;
+      } else {
+         tl_assert(i >= 0 && i < useda);
+         if (tmpa->tym < tmpb->tym) return -1;
+         if (tmpa->tym > tmpb->tym) return 1;
+         if (tmpa->thr < tmpb->thr) return -1;
+         if (tmpa->thr > tmpb->thr) return 1;
+         /* we just established them as non-identical, hence: */
+      }
+      /*NOTREACHED*/
+      tl_assert(0);
+   }
 
    if (useda < usedb) return -1;
    if (useda > usedb) return 1;
-
-   /* Same length vectors, so let's step through them together. */
-   tl_assert(useda == usedb);
-   for (i = 0; i < useda; i++) {
-      tmpa = VG_(indexXA)( a->ts, i );
-      tmpb = VG_(indexXA)( b->ts, i );
-      if (tmpa->tym < tmpb->tym) return -1;
-      if (tmpa->tym > tmpb->tym) return 1;
-      if (tmpa->thr < tmpb->thr) return -1;
-      if (tmpa->thr > tmpb->thr) return 1;
-   }
-
-   /* They're identical. */
-   return 0;
+   /*NOTREACHED*/
+   tl_assert(0);
 }
 
 
@@ -1972,6 +2011,7 @@
 */
 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx ) {
    UWord i, n;
+   stats__vts__indexat_slow++;
    tl_assert(vts && vts->ts);
    n = VG_(sizeXA)( vts->ts );
    for (i = 0; i < n; i++) {
@@ -2029,12 +2069,14 @@
 static VTS* vts_set__find_and_dealloc__or_add ( VTS* cand )
 {
    UWord keyW, valW;
+   stats__vts_set__fadoa++;
    /* lookup cand (by value) */
    if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
       /* found it */
       tl_assert(valW == 0);
       /* if this fails, cand (by ref) was already present (!) */
       tl_assert(keyW != (UWord)cand);
+      stats__vts_set__fadoa_d++;
       VTS__delete(cand);
       return (VTS*)keyW;
    } else {
@@ -5469,9 +5511,9 @@
 
       VG_(printf)("%s","\n");
 
-      VG_(printf)("   libhb: %'13llu msmcread  (%'llu changed)\n",
+      VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
                   stats__msmcread, stats__msmcread_change);
-      VG_(printf)("   libhb: %'13llu msmcwrite (%'llu changed)\n",
+      VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
                   stats__msmcwrite, stats__msmcwrite_change);
       VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
                   stats__cmpLEQ_queries, stats__cmpLEQ_misses);
@@ -5479,6 +5521,16 @@
                   stats__join2_queries, stats__join2_misses);
 
       VG_(printf)("%s","\n");
+      VG_(printf)( "   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
+                   stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
+      VG_(printf)( "   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
+                   stats__vts__cmp_structural, stats__vts__cmp_structural_slow );
+      VG_(printf)( "   libhb: VTSset: find_and_dealloc__or_add %'lu (%'lu deallocd)\n",
+                   stats__vts_set__fadoa, stats__vts_set__fadoa_d );
+      VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
+                   stats__vts__indexat_slow );
+
+      VG_(printf)("%s","\n");
       VG_(printf)(
          "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
          VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)