Small improvements, including some commenting, for the H1 history
mechanism.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@10686 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/helgrind/libhb_core.c b/helgrind/libhb_core.c
index fab97ec..2b7d92c 100644
--- a/helgrind/libhb_core.c
+++ b/helgrind/libhb_core.c
@@ -2839,6 +2839,18 @@
 // QQQ move this somewhere else
 typedef  struct { ULong ull; ExeContext* ec; }  ULong_n_EC;
 
+/* How many of the above records to collect for each thread?  Older
+   ones are dumped when we run out of space.  62.5k requires 1MB per
+   thread, since each ULong_n_EC record is 16 bytes long.  When more
+   than N_KWs_N_STACKs_PER_THREAD are present, the older half are
+   deleted to make space.  Hence in the worst case we will be able to
+   produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
+   Kw transitions (segments in this thread).  For the current setting
+   that gives a guaranteed stack for at least the last 31.25k
+   segments. */
+#define N_KWs_N_STACKs_PER_THREAD 62500
+
+
 struct _Thr {
    /* Current VTSs for this thread.  They change as we go along.  viR
       is the VTS to be used for reads, viW for writes.  Usually they
@@ -2861,11 +2873,11 @@
    /* opaque (to us) data we hold on behalf of the library's user. */
    void* opaque;
 
-   /* The ULongs (scalar Krs) in this accumulate in strictly
+   /* The ULongs (scalar Kws) in this accumulate in strictly
       increasing order, without duplicates.  This is important because
-      we need to be able to find a given scalar Kr in this array
+      we need to be able to find a given scalar Kw in this array
       later, by binary search. */
-   XArray* /* ULong_n_EC */ local_Krs_n_stacks;
+   XArray* /* ULong_n_EC */ local_Kws_n_stacks;
 };
 
 static Thr* Thr__new ( void ) {
@@ -2874,13 +2886,13 @@
    thr->viW = VtsID_INVALID;
    thr->still_alive = True;
    thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
-   thr->local_Krs_n_stacks
-      = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.3 (local_Krs_and_stacks)",
+   thr->local_Kws_n_stacks
+      = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.3 (local_Kws_and_stacks)",
                     HG_(free), sizeof(ULong_n_EC) );
    return thr;
 }
 
-static void note_local_Kr_n_stack_for ( Thr* thr )
+static void note_local_Kw_n_stack_for ( Thr* thr )
 {
    Word       nPresent;
    ULong_n_EC pair;
@@ -2890,38 +2902,38 @@
    if (HG_(clo_history_level) != 1) 
       return;
 
-   /* This is the scalar Kr for thr. */
-   pair.ull = VtsID__indexAt( thr->viR, thr );
+   /* This is the scalar Kw for thr. */
+   pair.ull = VtsID__indexAt( thr->viW, thr );
    pair.ec  = main_get_EC( thr );
    tl_assert(pair.ec);
-   tl_assert(thr->local_Krs_n_stacks);
+   tl_assert(thr->local_Kws_n_stacks);
 
    /* check that we're not adding duplicates */
-   nPresent = VG_(sizeXA)( thr->local_Krs_n_stacks );
+   nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
 
    /* Throw away old stacks, if necessary.  We can't accumulate stuff
       indefinitely. */
-   if (nPresent > 10000) {
-      VG_(dropHeadXA)( thr->local_Krs_n_stacks, nPresent / 2 );
-      nPresent = VG_(sizeXA)( thr->local_Krs_n_stacks );
-      if (1)
-         VG_(printf)("LOCAL Kr: thr %p,  Kr %llu,  ec %p (!!! gc !!!)\n",
+   if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
+      VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
+      nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
+      if (0)
+         VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
                      thr, pair.ull, pair.ec );
    }
 
    if (nPresent > 0) {
       ULong_n_EC* prevPair
-         = (ULong_n_EC*)VG_(indexXA)( thr->local_Krs_n_stacks, nPresent-1 );
-      tl_assert( prevPair->ull < pair.ull );
+         = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
+      tl_assert( prevPair->ull <= pair.ull );
    }
 
    if (nPresent == 0)
       pair.ec = NULL;
 
-   VG_(addToXA)( thr->local_Krs_n_stacks, &pair );
+   VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
 
    if (0)
-      VG_(printf)("LOCAL Kr: thr %p,  Kr %llu,  ec %p\n",
+      VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
                   thr, pair.ull, pair.ec );
    if (0)
       VG_(pp_ExeContext)(pair.ec);
@@ -4047,10 +4059,56 @@
 static ULong stats__msmcwrite        = 0;
 static ULong stats__msmcwrite_change = 0;
 
+/* Some notes on the H1 history mechanism:
+
+   Transition rules are:
+
+   read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
+   write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
+
+   After any access by a thread T to a location L, L's constraint pair
+   (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
+
+   After a race by thread T conflicting with some previous access by
+   some other thread U, for a location with constraint (before
+   processing the later access) (Cr,Cw), then Cw[U] is the segment in
+   which the previously access lies.
+
+   Hence in record_race_info, we pass in Cfailed and Kfailed, which
+   are compared so as to find out which thread(s) this access
+   conflicts with.  Once that is established, we also require the
+   pre-update Cw for the location, so we can index into it for those
+   threads, to get the scalar clock values for the point at which the
+   former accesses were made.  (In fact we only bother to do any of
+   this for an arbitrarily chosen one of the conflicting threads, as
+   that's simpler, it avoids flooding the user with vast amounts of
+   mostly useless information, and because the program is wrong if it
+   contains any races at all -- so we don't really need to show all
+   conflicting access pairs initially, so long as we only show none if
+   none exist).
+
+   ---
+
+   That requires the auxiliary proof that 
+
+      (Cr `join` Kw)[T] == Kw[T]
+
+   Why should that be true?  Because for any thread T, Kw[T] >= the
+   scalar clock value for T known by any other thread.  In other
+   words, because T's value for its own scalar clock is at least as up
+   to date as the value for it known by any other thread (that is true
+   for both the R- and W- scalar clocks).  Hence no other thread will
+   be able to feed in a value for that element (indirectly via a
+   constraint) which will exceed Kw[T], and hence the join cannot
+   cause that particular element to advance.
+*/
+
 __attribute__((noinline))
 static void record_race_info ( Thr* acc_thr, 
                                Addr acc_addr, SizeT szB, Bool isWrite,
-                               VtsID vtsConstraint, VtsID vtsKlock )
+                               VtsID Cfailed,
+                               VtsID Kfailed,
+                               VtsID Cw )
 {
    /* Call here to report a race.  We just hand it onwards to
       HG_(record_error_Race).  If that in turn discovers that the
@@ -4078,8 +4136,8 @@
 
       /* At history_level 1, we must round up the relevant stack-pair
          for the conflicting segment right now.  This is because
-         deferring it is complex; we can't (easily) put vtsKlock and
-         vtsConstraint into the XError and wait for later without
+         deferring it is complex; we can't (easily) put Kfailed and
+         Cfailed into the XError and wait for later without
          getting tied up in difficulties with VtsID reference
          counting.  So just do it now. */
       Thr*  confThr;
@@ -4087,9 +4145,12 @@
       /* Which thread are we in conflict with?  There may be more than
          one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
          (in fact it's the one with the lowest Thr* value). */
-      confThr = VtsID__findFirst_notLEQ( vtsConstraint, vtsKlock );
+      confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
       /* This must exist!  since if it was NULL then there's no
-         conflict (semantics of return value of VtsID__findFirst_notLEQ) */
+         conflict (semantics of return value of
+         VtsID__findFirst_notLEQ), and msmc{read,write}, which has
+         called us, just checked exactly this -- that there was in
+         fact a race. */
       tl_assert(confThr);
 
       /* Get the scalar clock value that the conflicting thread
@@ -4098,7 +4159,7 @@
          conflicting thread's scalar clock when it created this
          constraint.  Hence we know the scalar clock of the
          conflicting thread when the conflicting access was made. */
-      confTym = VtsID__indexAt( vtsConstraint, confThr );
+      confTym = VtsID__indexAt( Cfailed, confThr );
 
       /* Using this scalar clock, index into the conflicting thread's
          collection of stack traces made each time its vector clock
@@ -4109,34 +4170,39 @@
       key.ull = confTym;
       key.ec  = NULL;
       /* tl_assert(confThr); -- asserted just above */
-      tl_assert(confThr->local_Krs_n_stacks);
+      tl_assert(confThr->local_Kws_n_stacks);
       firstIx = lastIx = 0;
       found = VG_(lookupXA_UNSAFE)(
-                 confThr->local_Krs_n_stacks,
+                 confThr->local_Kws_n_stacks,
                  &key, &firstIx, &lastIx,
                  (Int(*)(void*,void*))cmp__ULong_n_EC__by_ULong
               );
-      if (0) VG_(printf)("record_race_info %u %u  confThr %p "
+      if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
                          "confTym %llu found %d (%lu,%lu)\n", 
-                         vtsConstraint, vtsKlock,
+                         Cfailed, Kfailed, Cw,
                          confThr, confTym, found, firstIx, lastIx);
       /* We can't indefinitely collect stack traces at VTS
          transitions, since we'd eventually run out of memory.  Hence
-         note_local_Kr_n_stack_for will eventually throw away old
+         note_local_Kw_n_stack_for will eventually throw away old
          ones, which in turn means we might fail to find index value
          confTym in the array. */
       if (found) {
          ULong_n_EC *pair_start, *pair_end;
          pair_start 
-            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Krs_n_stacks, lastIx );
+            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
          hist1_seg_start = pair_start->ec;
-         if (lastIx+1 < VG_(sizeXA)( confThr->local_Krs_n_stacks )) {
+         if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
             pair_end
-               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Krs_n_stacks,
+               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
                                             lastIx+1 );
             /* from properties of VG_(lookupXA) and the comparison fn used: */
             tl_assert(pair_start->ull < pair_end->ull);
             hist1_seg_end = pair_end->ec;
+            /* Could do a bit better here.  It may be that pair_end
+               doesn't have a stack, but the following entries in the
+               array have the same scalar Kw and to have a stack.  So
+               we should search a bit further along the array than
+               lastIx+1 if hist1_seg_end is NULL. */
          } else {
             if (confThr->still_alive)
                hist1_seg_end = main_get_EC( confThr );
@@ -4194,7 +4260,9 @@
          // same as in non-race case
          svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
          record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
-                           rmini, tviR );
+                           rmini, /* Cfailed */
+                           tviR,  /* Kfailed */
+                           wmini  /* Cw */ );
          goto out;
       }
    }
@@ -4265,7 +4333,9 @@
          svNew = SVal__mkC( VtsID__join2(rmini, tviW),
                             VtsID__join2(wmini, tviW) );
          record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
-                           wmini, acc_thr->viW );
+                           wmini, /* Cfailed */
+                           tviW,  /* Kfailed */
+                           wmini  /* Cw */ );
          goto out;
       }
    }
@@ -5186,9 +5256,9 @@
       at all, get one right now.  This is easier than figuring out
       exactly when at thread startup we can and can't take a stack
       snapshot. */
-   tl_assert(thr->local_Krs_n_stacks);
-   if (VG_(sizeXA)( thr->local_Krs_n_stacks ) == 0)
-      note_local_Kr_n_stack_for(thr);
+   tl_assert(thr->local_Kws_n_stacks);
+   if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
+      note_local_Kw_n_stack_for(thr);
 }
 
 
@@ -5300,7 +5370,7 @@
    Filter__clear(child->filter, "libhb_create(child)");
    VtsID__rcinc(child->viR);
    VtsID__rcinc(child->viW);
-   /* We need to do note_local_Kr_n_stack_for( child ), but it's too
+   /* We need to do note_local_Kw_n_stack_for( child ), but it's too
       early for that - it may not have a valid TId yet.  So, let
       libhb_Thr_resumes pick it up the first time the thread runs. */
 
@@ -5315,7 +5385,7 @@
    Filter__clear(parent->filter, "libhb_create(parent)");
    VtsID__rcinc(parent->viR);
    VtsID__rcinc(parent->viW);
-   note_local_Kr_n_stack_for( parent );
+   note_local_Kw_n_stack_for( parent );
 
    show_thread_state(" child", child);
    show_thread_state("parent", parent);
@@ -5452,7 +5522,7 @@
 {
    tl_assert(thr);
    thr->still_alive = False;
-   /* XXX free up Filter and local_Krs_n_stacks */
+   /* XXX free up Filter and local_Kws_n_stacks */
 }
 
 /* Both Segs and SOs point to VTSs.  However, there is no sharing, so
@@ -5515,7 +5585,7 @@
    thr->viW = VtsID__tick( thr->viW, thr );
    Filter__clear(thr->filter, "libhb_so_send");
    if (thr->still_alive)
-      note_local_Kr_n_stack_for(thr);
+      note_local_Kw_n_stack_for(thr);
    VtsID__rcinc(thr->viR);
    VtsID__rcinc(thr->viW);
 
@@ -5564,7 +5634,7 @@
       }
 
       Filter__clear(thr->filter, "libhb_so_recv");
-      note_local_Kr_n_stack_for(thr);
+      note_local_Kw_n_stack_for(thr);
 
       if (strong_recv) 
          show_thread_state("s-recv", thr);