Merge Helgrind from branches/YARD into the trunk.  Also includes some
minor changes to make stack unwinding on amd64-linux approximately
twice as fast as it was before.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@8707 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/helgrind/libhb_core.c b/helgrind/libhb_core.c
new file mode 100644
index 0000000..fc232f7
--- /dev/null
+++ b/helgrind/libhb_core.c
@@ -0,0 +1,4562 @@
+
+/*--------------------------------------------------------------------*/
+/*--- LibHB: a library for implementing and checking               ---*/
+/*--- the happens-before relationship in concurrent programs.      ---*/
+/*---                                                 libhb_core.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of LibHB, a library for implementing and checking
+   the happens-before relationship in concurrent programs.
+
+   Copyright (C) 2008-2008 OpenWorks Ltd
+      info@open-works.co.uk
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "pub_tool_basics.h"
+#include "pub_tool_libcassert.h"
+#include "pub_tool_libcbase.h"
+#include "pub_tool_libcprint.h"
+#include "pub_tool_mallocfree.h"
+#include "pub_tool_wordfm.h"
+#include "pub_tool_xarray.h"
+#include "pub_tool_oset.h"
+#include "pub_tool_threadstate.h"
+#include "pub_tool_aspacemgr.h"
+#include "pub_tool_execontext.h"
+#include "pub_tool_errormgr.h"
+
+#include "hg_basics.h"
+#include "hg_wordset.h"
+#include "hg_lock_n_thread.h"
+#include "hg_errors.h"
+
+#include "libhb.h"
+
+
+/* Globals needed by other parts of the library.  These are set
+   once at startup and then never changed. */
+static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
+static struct _EC* (*main_stacktrace_to_EC)( Addr*, UWord ) = NULL;
+static struct _EC* (*main_get_EC)( Thr* ) = NULL;
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+//                                                             //
+//                                                             //
+//                                                             //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+//                                                             //
+// SECTION BEGIN compressed shadow memory                      //
+//                                                             //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+#ifndef __HB_ZSM_H
+#define __HB_ZSM_H
+
+typedef  ULong  SVal;
+
+/* This value has special significance to the implementation, and callers
+   may not store it in the shadow memory. */
+#define SVal_INVALID (3ULL << 62)
+
+/* This is the default value for shadow memory.  Initially the shadow
+   memory contains no accessible areas and so all reads produce this
+   value.  TODO: make this caller-definable. */
+#define SVal_NOACCESS (2ULL << 62)
+
+/* Initialise the library.  Once initialised, it will (or may) call
+   rcinc and rcdec in response to all the calls below, in order to
+   allow the user to do reference counting on the SVals stored herein.
+   It is important to understand, however, that due to internal
+   caching, the reference counts are in general inaccurate, and can be
+   both above or below the true reference count for an item.  In
+   particular, the library may indicate that the reference count for
+   an item is zero, when in fact it is not.
+
+   To make the reference counting exact and therefore non-pointless,
+   call zsm_flush_cache.  Immediately after it returns, the reference
+   counts for all items, as deduced by the caller by observing calls
+   to rcinc and rcdec, will be correct, and so any items with a zero
+   reference count may be freed (or at least considered to be
+   unreferenced by this library).
+*/
+static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );
+
+static void zsm_set_range   ( Addr, SizeT, SVal );
+static SVal zsm_read8       ( Addr );
+static void zsm_copy_range  ( Addr, Addr, SizeT );
+static void zsm_flush_cache ( void );
+
+#endif /* ! __HB_ZSM_H */
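+
+/* Illustrative sketch only (not part of the build): a caller might
+   drive this interface roughly as follows, where my_rcinc/my_rcdec
+   are hypothetical reference-counting callbacks owned by the caller.
+
+      zsm_init( my_rcinc, my_rcdec );
+      zsm_set_range( ga, 4096, sv_new );  // shadow ga .. ga+4095
+      SVal sv = zsm_read8( ga + 16 );     // shadow value for one byte
+      zsm_flush_cache();                  // RCs are now exact
+
+   Note that SVal_INVALID must never be passed to zsm_set_range. */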
+
+
+/* For the shadow memory cache we may want more intrusive
+   checks.  Unfortunately there's no almost-zero-cost way to make them
+   selectable at run time.  Hence change the #if 0 to #if 1 and
+   rebuild if you want them. */
+#if 0
+#  define SCE_CACHELINE 1  /* do sanity-check CacheLine stuff */
+#  define inline __attribute__((noinline))
+   /* probably want to ditch -fomit-frame-pointer too */
+#else
+#  define SCE_CACHELINE 0   /* don't sanity-check CacheLine stuff */
+#endif
+
+/* For the SegmentID, SegmentSet and SVal stuff we may want more
+   intrusive checks.  Again there's no zero-cost way to do this.
+   Change the #if 0 to #if 1 and rebuild if you want them. */
+#if 0
+#  define SCE_SVALS 1 /* sanity-check shadow value stuff */
+#else
+#  define SCE_SVALS 0
+#endif
+
+
+/* Round a up to the next multiple of N.  N must be a power of 2. */
+#define ROUNDUP(a, N)   (((a) + (N) - 1) & ~((N)-1))
+/* Round a down to a multiple of N.  N must be a power of 2. */
+#define ROUNDDN(a, N)   ((a) & ~((N)-1))
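+
+/* For example, with N == 64: ROUNDUP(0x1003, 64) == 0x1040 and
+   ROUNDDN(0x1003, 64) == 0x1000; both are identities on values that
+   are already 64-aligned. */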
+
+
+
+/* ------ User-supplied RC functions ------ */
+static void(*rcinc)(SVal) = NULL;
+static void(*rcdec)(SVal) = NULL;
+
+
+/* ------ CacheLine ------ */
+
+#define N_LINE_BITS      6 /* must be >= 3 */
+#define N_LINE_ARANGE    (1 << N_LINE_BITS)
+#define N_LINE_TREES     (N_LINE_ARANGE >> 3)
+
+typedef
+   struct {
+      UShort descrs[N_LINE_TREES];
+      SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
+   }
+   CacheLine;
+
+#define TREE_DESCR_16_0 (1<<0)
+#define TREE_DESCR_32_0 (1<<1)
+#define TREE_DESCR_16_1 (1<<2)
+#define TREE_DESCR_64   (1<<3)
+#define TREE_DESCR_16_2 (1<<4)
+#define TREE_DESCR_32_1 (1<<5)
+#define TREE_DESCR_16_3 (1<<6)
+#define TREE_DESCR_8_0  (1<<7)
+#define TREE_DESCR_8_1  (1<<8)
+#define TREE_DESCR_8_2  (1<<9)
+#define TREE_DESCR_8_3  (1<<10)
+#define TREE_DESCR_8_4  (1<<11)
+#define TREE_DESCR_8_5  (1<<12)
+#define TREE_DESCR_8_6  (1<<13)
+#define TREE_DESCR_8_7  (1<<14)
+#define TREE_DESCR_DTY  (1<<15)
+
+typedef
+   struct {
+      SVal  dict[4]; /* can represent up to 4 different values in the line */
+      UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
+                                      dict indexes */
+      /* if dict[0] == SVal_INVALID then dict[1] is the index of the
+         LineF to use, and dict[2..] are also SVal_INVALID. */
+   }
+   LineZ; /* compressed rep for a cache line */
+
+typedef
+   struct {
+      Bool inUse;
+      SVal w64s[N_LINE_ARANGE];
+   }
+   LineF; /* full rep for a cache line */
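+
+/* Worked example (illustrative): suppose the 64 bytes covered by a
+   line carry only two distinct shadow values, A for bytes 0..31 and
+   B for bytes 32..63.  The compressed (Z) form is then
+
+      dict[0] = A, dict[1] = B, dict[2] = dict[3] = SVal_INVALID
+      ix2s[]  = 32 two-bit zeroes followed by 32 two-bit ones
+                (four indexes per byte, 16 bytes in total)
+
+   If a line ever needs more than 4 distinct values, dict[0] is set
+   to SVal_INVALID and dict[1] holds the index of the LineF that
+   carries the uncompressed 64 SVals instead. */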
+
+/* Shadow memory.
+   Primary map is a WordFM Addr SecMap*.  
+   SecMaps cover some page-size-ish section of address space and hold
+     a compressed representation.
+   CacheLine-sized chunks of SecMaps are copied into a Cache, being
+   decompressed when moved into the cache and recompressed on the
+   way out.  Because of this, the cache must operate as a writeback
+   cache, not a writethrough one.
+
+   Each SecMap must hold a power-of-2 number of CacheLines.  Hence
+   N_SECMAP_BITS must be >= N_LINE_BITS.
+*/
+#define N_SECMAP_BITS   13
+#define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
+
+// # CacheLines held by a SecMap
+#define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
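+
+/* With the values above: a SecMap covers 2^13 == 8192 bytes of guest
+   address space, a CacheLine covers 2^6 == 64 bytes, and so each
+   SecMap holds 8192/64 == 128 LineZs. */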
+
+/* The data in the SecMap is held in the array of LineZs.  Each LineZ
+   either carries the required data directly, in a compressed
+   representation, or it holds (in .dict[0]) an index to the LineF in
+   .linesF that holds the full representation.
+
+   Currently-unused LineF's have their .inUse bit set to zero.
+   Since each in-use LineF is referred to by exactly one LineZ,
+   the number of .linesZ[] that refer to .linesF should equal
+   the number of .linesF[] that have .inUse == True.
+
+   RC obligations: the RCs presented to the user include exactly
+   the values in:
+   * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
+   * F reps that are in use (.inUse == True)
+
+   Hence the following actions at the following transitions are required:
+
+   F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
+   F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
+   Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
+   Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
+*/
+typedef
+   struct {
+      UInt   magic;
+      LineZ  linesZ[N_SECMAP_ZLINES];
+      LineF* linesF;
+      UInt   linesF_size;
+   }
+   SecMap;
+
+#define SecMap_MAGIC   0x571e58cbU
+
+static inline Bool is_sane_SecMap ( SecMap* sm ) {
+   return sm != NULL && sm->magic == SecMap_MAGIC;
+}
+
+/* ------ Cache ------ */
+
+#define N_WAY_BITS 16
+#define N_WAY_NENT (1 << N_WAY_BITS)
+
+/* Each tag is the address of the associated CacheLine, rounded down
+   to a CacheLine address boundary.  The CacheLine size must be a
+   power of 2 and must be 8 or more.  Hence an easy way to initialise
+   the cache so it is empty is to set all the tag values to any value
+   % 8 != 0, eg 1.  This means all queries in the cache initially
+   miss.  It does however require us to detect, and not write back,
+   any line with a bogus tag. */
+typedef
+   struct {
+      CacheLine lyns0[N_WAY_NENT];
+      Addr      tags0[N_WAY_NENT];
+   }
+   Cache;
+
+static inline Bool is_valid_scache_tag ( Addr tag ) {
+   /* a valid tag should be naturally aligned to the start of
+      a CacheLine. */
+   return 0 == (tag & (N_LINE_ARANGE - 1));
+}
+
+
+/* --------- Primary data structures --------- */
+
+/* Shadow memory primary map */
+static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
+static Cache   cache_shmem;
+
+
+static UWord stats__secmaps_search       = 0; // # SM finds
+static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
+static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
+static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
+static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
+static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
+static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
+static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
+static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
+static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
+static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
+static UWord stats__cache_F_fetches      = 0; // # F lines fetched
+static UWord stats__cache_F_wbacks       = 0; // # F lines written back
+static UWord stats__cache_invals         = 0; // # cache invals
+static UWord stats__cache_flushes        = 0; // # cache flushes
+static UWord stats__cache_totrefs        = 0; // # total accesses
+static UWord stats__cache_totmisses      = 0; // # misses
+static ULong stats__cache_make_New_arange = 0; // total arange made New
+static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
+static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
+static UWord stats__cline_read64s        = 0; // # calls to s_m_read64
+static UWord stats__cline_read32s        = 0; // # calls to s_m_read32
+static UWord stats__cline_read16s        = 0; // # calls to s_m_read16
+static UWord stats__cline_read8s         = 0; // # calls to s_m_read8
+static UWord stats__cline_write64s       = 0; // # calls to s_m_write64
+static UWord stats__cline_write32s       = 0; // # calls to s_m_write32
+static UWord stats__cline_write16s       = 0; // # calls to s_m_write16
+static UWord stats__cline_write8s        = 0; // # calls to s_m_write8
+static UWord stats__cline_set64s         = 0; // # calls to s_m_set64
+static UWord stats__cline_set32s         = 0; // # calls to s_m_set32
+static UWord stats__cline_set16s         = 0; // # calls to s_m_set16
+static UWord stats__cline_set8s          = 0; // # calls to s_m_set8
+static UWord stats__cline_get8s          = 0; // # calls to s_m_get8
+static UWord stats__cline_copy8s         = 0; // # calls to s_m_copy8
+static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
+static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
+static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
+static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
+static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
+static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
+
+static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
+   return a & ~(N_SECMAP_ARANGE - 1);
+}
+static inline UWord shmem__get_SecMap_offset ( Addr a ) {
+   return a & (N_SECMAP_ARANGE - 1);
+}
+
+
+/*----------------------------------------------------------------*/
+/*--- map_shmem :: WordFM Addr SecMap                          ---*/
+/*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
+/*----------------------------------------------------------------*/
+
+/*--------------- SecMap allocation --------------- */
+
+static HChar* shmem__bigchunk_next = NULL;
+static HChar* shmem__bigchunk_end1 = NULL;
+
+static void* shmem__bigchunk_alloc ( SizeT n )
+{
+   const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
+   tl_assert(n > 0);
+   n = VG_ROUNDUP(n, 16);
+   tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
+   tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
+             <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
+   if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
+      if (0)
+      VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
+                  (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
+      shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
+      if (shmem__bigchunk_next == NULL)
+         VG_(out_of_memory_NORETURN)(
+            "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
+      shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
+   }
+   tl_assert(shmem__bigchunk_next);
+   tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
+   tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
+   shmem__bigchunk_next += n;
+   return shmem__bigchunk_next - n;
+}
+
+static SecMap* shmem__alloc_SecMap ( void )
+{
+   Word    i, j;
+   SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
+   if (0) VG_(printf)("alloc_SecMap %p\n",sm);
+   tl_assert(sm);
+   sm->magic = SecMap_MAGIC;
+   for (i = 0; i < N_SECMAP_ZLINES; i++) {
+      sm->linesZ[i].dict[0] = SVal_NOACCESS;
+      sm->linesZ[i].dict[1] = SVal_INVALID;
+      sm->linesZ[i].dict[2] = SVal_INVALID;
+      sm->linesZ[i].dict[3] = SVal_INVALID;
+      for (j = 0; j < N_LINE_ARANGE/4; j++)
+         sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
+   }
+   sm->linesF      = NULL;
+   sm->linesF_size = 0;
+   stats__secmaps_allocd++;
+   stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
+   stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
+   stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
+   return sm;
+}
+
+typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
+static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
+
+static SecMap* shmem__find_SecMap ( Addr ga ) 
+{
+   SecMap* sm    = NULL;
+   Addr    gaKey = shmem__round_to_SecMap_base(ga);
+   // Cache
+   stats__secmaps_search++;
+   if (LIKELY(gaKey == smCache[0].gaKey))
+      return smCache[0].sm;
+   if (LIKELY(gaKey == smCache[1].gaKey)) {
+      SMCacheEnt tmp = smCache[0];
+      smCache[0] = smCache[1];
+      smCache[1] = tmp;
+      return smCache[0].sm;
+   }
+   if (gaKey == smCache[2].gaKey) {
+      SMCacheEnt tmp = smCache[1];
+      smCache[1] = smCache[2];
+      smCache[2] = tmp;
+      return smCache[1].sm;
+   }
+   // end Cache
+   stats__secmaps_search_slow++;
+   if (VG_(lookupFM)( map_shmem,
+                      NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
+      tl_assert(sm != NULL);
+      smCache[2] = smCache[1];
+      smCache[1] = smCache[0];
+      smCache[0].gaKey = gaKey;
+      smCache[0].sm    = sm;
+   } else {
+      tl_assert(sm == NULL);
+   }
+   return sm;
+}
+
+static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
+{
+   SecMap* sm = shmem__find_SecMap ( ga );
+   if (LIKELY(sm)) {
+      return sm;
+   } else {
+      /* create a new one */
+      Addr gaKey = shmem__round_to_SecMap_base(ga);
+      sm = shmem__alloc_SecMap();
+      tl_assert(sm);
+      VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
+      return sm;
+   }
+}
+
+
+/* ------------ LineF and LineZ related ------------ */
+
+static void rcinc_LineF ( LineF* lineF ) {
+   UWord i;
+   tl_assert(lineF->inUse);
+   for (i = 0; i < N_LINE_ARANGE; i++)
+      rcinc(lineF->w64s[i]);
+}
+
+static void rcdec_LineF ( LineF* lineF ) {
+   UWord i;
+   tl_assert(lineF->inUse);
+   for (i = 0; i < N_LINE_ARANGE; i++)
+      rcdec(lineF->w64s[i]);
+}
+
+static void rcinc_LineZ ( LineZ* lineZ ) {
+   tl_assert(lineZ->dict[0] != SVal_INVALID);
+   rcinc(lineZ->dict[0]);
+   if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
+   if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
+   if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
+}
+
+static void rcdec_LineZ ( LineZ* lineZ ) {
+   tl_assert(lineZ->dict[0] != SVal_INVALID);
+   rcdec(lineZ->dict[0]);
+   if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
+   if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
+   if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
+}
+
+inline
+static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
+   Word bix, shft, mask, prep;
+   tl_assert(ix >= 0);
+   bix  = ix >> 2;
+   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
+   mask = 3 << shft;
+   prep = b2 << shft;
+   arr[bix] = (arr[bix] & ~mask) | prep;
+}
+
+inline
+static UWord read_twobit_array ( UChar* arr, UWord ix ) {
+   Word bix, shft;
+   tl_assert(ix >= 0);
+   bix  = ix >> 2;
+   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
+   return (arr[bix] >> shft) & 3;
+}
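+
+/* For example, index 5 lives in byte 5 >> 2 == 1 of the array, at
+   shift 2 * (5 & 3) == 2.  So write_twobit_array(arr, 5, 3) sets
+   bits 3:2 of arr[1] to 3 while leaving the other indexes alone,
+   and read_twobit_array(arr, 5) then returns 3. */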
+
+/* Given address 'tag', find either the Z or F line containing relevant
+   data, so it can be read into the cache.
+*/
+static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
+                                  /*OUT*/LineF** fp, Addr tag ) {
+   LineZ* lineZ;
+   LineF* lineF;
+   UWord   zix;
+   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
+   UWord   smoff = shmem__get_SecMap_offset(tag);
+   /* since smoff is derived from a valid tag, it should be
+      cacheline-aligned. */
+   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
+   zix = smoff >> N_LINE_BITS;
+   tl_assert(zix < N_SECMAP_ZLINES);
+   lineZ = &sm->linesZ[zix];
+   lineF = NULL;
+   if (lineZ->dict[0] == SVal_INVALID) {
+      UInt fix = (UInt)lineZ->dict[1];
+      tl_assert(sm->linesF);
+      tl_assert(sm->linesF_size > 0);
+      tl_assert(fix >= 0 && fix < sm->linesF_size);
+      lineF = &sm->linesF[fix];
+      tl_assert(lineF->inUse);
+      lineZ = NULL;
+   }
+   *zp = lineZ;
+   *fp = lineF;
+}
+
+/* Given address 'tag', return the relevant SecMap and the index of
+   the LineZ within it, in the expectation that the line is to be
+   overwritten.  Regardless of whether 'tag' is currently associated
+   with a Z or F representation, the current representation is
+   rcdec'd, in recognition of the fact that its contents are just
+   about to be overwritten. */
+static __attribute__((noinline))
+void find_Z_for_writing ( /*OUT*/SecMap** smp,
+                          /*OUT*/Word* zixp,
+                          Addr tag ) {
+   LineZ* lineZ;
+   LineF* lineF;
+   UWord   zix;
+   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
+   UWord   smoff = shmem__get_SecMap_offset(tag);
+   /* since smoff is derived from a valid tag, it should be
+      cacheline-aligned. */
+   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
+   zix = smoff >> N_LINE_BITS;
+   tl_assert(zix < N_SECMAP_ZLINES);
+   lineZ = &sm->linesZ[zix];
+   lineF = NULL;
+   /* re RCs, we are freeing up this LineZ/LineF so that new data can
+      be parked in it.  Hence have to rcdec it accordingly. */
+   /* If lineZ has an associated lineF, free it up. */
+   if (lineZ->dict[0] == SVal_INVALID) {
+      UInt fix = (UInt)lineZ->dict[1];
+      tl_assert(sm->linesF);
+      tl_assert(sm->linesF_size > 0);
+      tl_assert(fix >= 0 && fix < sm->linesF_size);
+      lineF = &sm->linesF[fix];
+      tl_assert(lineF->inUse);
+      rcdec_LineF(lineF);
+      lineF->inUse = False;
+   } else {
+      rcdec_LineZ(lineZ);
+   }
+   *smp  = sm;
+   *zixp = zix;
+}
+
+static __attribute__((noinline))
+void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
+   UInt        i, new_size;
+   LineF* nyu;
+
+   if (sm->linesF) {
+      tl_assert(sm->linesF_size > 0);
+   } else {
+      tl_assert(sm->linesF_size == 0);
+   }
+
+   if (sm->linesF) {
+      for (i = 0; i < sm->linesF_size; i++) {
+         if (!sm->linesF[i].inUse) {
+            *fixp = (Word)i;
+            return;
+         }
+      }
+   }
+
+   /* No free F line found.  Expand existing array and try again. */
+   new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
+   nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
+                           new_size * sizeof(LineF) );
+   tl_assert(nyu);
+
+   stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
+   stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
+                                  * sizeof(LineF);
+
+   if (0)
+   VG_(printf)("SM %p: expand F array from %d to %d\n", 
+               sm, (Int)sm->linesF_size, new_size);
+
+   for (i = 0; i < new_size; i++)
+      nyu[i].inUse = False;
+
+   if (sm->linesF) {
+      for (i = 0; i < sm->linesF_size; i++) {
+         tl_assert(sm->linesF[i].inUse);
+         nyu[i] = sm->linesF[i];
+      }
+      VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
+      HG_(free)(sm->linesF);
+   }
+
+   sm->linesF      = nyu;
+   sm->linesF_size = new_size;
+
+   for (i = 0; i < sm->linesF_size; i++) {
+      if (!sm->linesF[i].inUse) {
+         *fixp = (Word)i;
+         return;
+      }
+   }
+
+   /*NOTREACHED*/
+   tl_assert(0);
+}
+
+
+/* ------------ CacheLine and implicit-tree related ------------ */
+
+__attribute__((unused))
+static void pp_CacheLine ( CacheLine* cl ) {
+   Word i;
+   if (!cl) {
+      VG_(printf)("%s","pp_CacheLine(NULL)\n");
+      return;
+   }
+   for (i = 0; i < N_LINE_TREES; i++) 
+      VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
+   for (i = 0; i < N_LINE_ARANGE; i++) 
+      VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
+}
+
+static UChar descr_to_validbits ( UShort descr )
+{
+   /* a.k.a Party Time for gcc's constant folder */
+#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
+                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
+             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
+                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
+                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
+                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
+                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
+                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
+                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
+                          ( (b16_0) << 0) ) )
+
+#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
+             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
+                         ( (bit5) << 5) | ( (bit4) << 4) | \
+                         ( (bit3) << 3) | ( (bit2) << 2) | \
+                         ( (bit1) << 1) | ( (bit0) << 0) ) )
+
+   /* these should all get folded out at compile time */
+   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
+   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
+   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
+   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
+   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
+   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
+   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
+   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
+   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);
+
+   switch (descr) {
+   /*
+              +--------------------------------- TREE_DESCR_8_7
+              |             +------------------- TREE_DESCR_8_0
+              |             |  +---------------- TREE_DESCR_16_3
+              |             |  | +-------------- TREE_DESCR_32_1
+              |             |  | | +------------ TREE_DESCR_16_2
+              |             |  | | |  +--------- TREE_DESCR_64
+              |             |  | | |  |  +------ TREE_DESCR_16_1
+              |             |  | | |  |  | +---- TREE_DESCR_32_0
+              |             |  | | |  |  | | +-- TREE_DESCR_16_0
+              |             |  | | |  |  | | |
+              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
+   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
+                                                 return BYTE(1,1,1,1,1,1,1,1);
+   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
+                                                 return BYTE(1,1,0,1,1,1,1,1);
+   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */ 
+                                                 return BYTE(0,1,1,1,1,1,1,1);
+   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
+                                                 return BYTE(0,1,0,1,1,1,1,1);
+
+   case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */ 
+                                                 return BYTE(1,1,1,1,1,1,0,1);
+   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
+                                                 return BYTE(1,1,0,1,1,1,0,1);
+   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
+                                                 return BYTE(0,1,1,1,1,1,0,1);
+   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
+                                                 return BYTE(0,1,0,1,1,1,0,1);
+
+   case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
+                                                 return BYTE(1,1,1,1,0,1,1,1);
+   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
+                                                 return BYTE(1,1,0,1,0,1,1,1);
+   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
+                                                 return BYTE(0,1,1,1,0,1,1,1);
+   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
+                                                 return BYTE(0,1,0,1,0,1,1,1);
+
+   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
+                                                 return BYTE(1,1,1,1,0,1,0,1);
+   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
+                                                 return BYTE(1,1,0,1,0,1,0,1);
+   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
+                                                 return BYTE(0,1,1,1,0,1,0,1);
+   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
+                                                 return BYTE(0,1,0,1,0,1,0,1);
+
+   case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
+                                                 return BYTE(0,0,0,1,1,1,1,1);
+   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
+                                                 return BYTE(0,0,0,1,1,1,0,1);
+   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
+                                                 return BYTE(0,0,0,1,0,1,1,1);
+   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
+                                                 return BYTE(0,0,0,1,0,1,0,1);
+
+   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
+                                                 return BYTE(1,1,1,1,0,0,0,1);
+   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
+                                                 return BYTE(1,1,0,1,0,0,0,1);
+   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
+                                                 return BYTE(0,1,1,1,0,0,0,1);
+   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
+                                                 return BYTE(0,1,0,1,0,0,0,1);
+
+   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
+                                                 return BYTE(0,0,0,1,0,0,0,1);
+
+   case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
+                                                 return BYTE(0,0,0,0,0,0,0,1);
+
+   default: return BYTE(0,0,0,0,0,0,0,0);
+                   /* INVALID - any valid descr produces at least one
+                      valid bit in tree[0..7] */
+   }
+   /* NOTREACHED */
+   tl_assert(0);
+
+#  undef DESCR
+#  undef BYTE
+}
+
+__attribute__((unused))
+static Bool is_sane_Descr ( UShort descr ) {
+   return descr_to_validbits(descr) != 0;
+}
+
+static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
+   VG_(sprintf)(dst, 
+                "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
+                (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
+                (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
+                (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
+   );
+}
+static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
+   VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
+                     (Int)((byte & 128) ? 1 : 0),
+                     (Int)((byte &  64) ? 1 : 0),
+                     (Int)((byte &  32) ? 1 : 0),
+                     (Int)((byte &  16) ? 1 : 0),
+                     (Int)((byte &   8) ? 1 : 0),
+                     (Int)((byte &   4) ? 1 : 0),
+                     (Int)((byte &   2) ? 1 : 0),
+                     (Int)((byte &   1) ? 1 : 0)
+   );
+}
+
+static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
+   Word  i;
+   UChar validbits = descr_to_validbits(descr);
+   HChar buf[128], buf2[128];
+   if (validbits == 0)
+      goto bad;
+   for (i = 0; i < 8; i++) {
+      if (validbits & (1<<i)) {
+         if (tree[i] == SVal_INVALID)
+            goto bad;
+      } else {
+         if (tree[i] != SVal_INVALID)
+            goto bad;
+      }
+   }
+   return True;
+  bad:
+   sprintf_Descr( buf, descr );
+   sprintf_Byte( buf2, validbits );
+   VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
+   VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
+   VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
+   for (i = 0; i < 8; i++)
+      VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
+   VG_(printf)("%s","}\n");
+   return False;
+}
+
+static Bool is_sane_CacheLine ( CacheLine* cl )
+{
+   Word tno, cloff;
+
+   if (!cl) goto bad;
+
+   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
+      UShort descr = cl->descrs[tno];
+      SVal*  tree  = &cl->svals[cloff];
+      if (!is_sane_Descr_and_Tree(descr, tree))
+         goto bad;
+   }
+   tl_assert(cloff == N_LINE_ARANGE);
+   return True;
+  bad:
+   pp_CacheLine(cl);
+   return False;
+}
+
+static UShort normalise_tree ( /*MOD*/SVal* tree )
+{
+   UShort descr;
+   /* pre: incoming tree[0..7] does not contain any invalid shadow
+      values (SVals), in particular no zeroes. */
+   if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
+                || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
+                || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
+                || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
+      tl_assert(0);
+   
+   descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
+           | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
+           | TREE_DESCR_8_1 | TREE_DESCR_8_0;
+   /* build 16-bit layer */
+   if (tree[1] == tree[0]) {
+      tree[1] = SVal_INVALID;
+      descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
+      descr |= TREE_DESCR_16_0;
+   }
+   if (tree[3] == tree[2]) {
+      tree[3] = SVal_INVALID;
+      descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
+      descr |= TREE_DESCR_16_1;
+   }
+   if (tree[5] == tree[4]) {
+      tree[5] = SVal_INVALID;
+      descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
+      descr |= TREE_DESCR_16_2;
+   }
+   if (tree[7] == tree[6]) {
+      tree[7] = SVal_INVALID;
+      descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
+      descr |= TREE_DESCR_16_3;
+   }
+   /* build 32-bit layer */
+   if (tree[2] == tree[0]
+       && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
+      tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
+      descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
+      descr |= TREE_DESCR_32_0;
+   }
+   if (tree[6] == tree[4]
+       && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
+      tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
+      descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
+      descr |= TREE_DESCR_32_1;
+   }
+   /* build 64-bit layer */
+   if (tree[4] == tree[0]
+       && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
+      tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
+      descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
+      descr |= TREE_DESCR_64;
+   }
+   return descr;
+}
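+
+/* Worked example (illustrative): if all eight leaves tree[0..7] hold
+   the same SVal, the three merge layers fire in turn and the result
+   is descr == TREE_DESCR_64, with tree[1..7] set to SVal_INVALID and
+   only tree[0] still live.  If instead only tree[0] == tree[1], the
+   result is TREE_DESCR_16_0 together with TREE_DESCR_8_2 .. 8_7. */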
+
+/* This takes a cacheline where all the data is at the leaves
+   (cl->svals[..], one SVal per byte of address range) and builds a
+   correctly normalised tree. */
+static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
+{
+   Word tno, cloff;
+   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
+      SVal* tree = &cl->svals[cloff];
+      cl->descrs[tno] = normalise_tree( tree );
+   }
+   tl_assert(cloff == N_LINE_ARANGE);
+   if (SCE_CACHELINE)
+      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   stats__cline_normalises++;
+}
+
+
+typedef struct { UChar count; SVal sval; } CountedSVal;
+
+static
+void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
+                               /*OUT*/Word* dstUsedP,
+                               Word nDst, CacheLine* src )
+{
+   Word  tno, cloff, dstUsed;
+
+   tl_assert(nDst == N_LINE_ARANGE);
+   dstUsed = 0;
+
+   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
+      UShort descr = src->descrs[tno];
+      SVal*  tree  = &src->svals[cloff];
+
+      /* sequentialise the tree described by (descr,tree). */
+#     define PUT(_n,_v)                                \
+         do { dst[dstUsed  ].count = (_n);             \
+              dst[dstUsed++].sval  = (_v);             \
+         } while (0)
+
+      /* byte 0 */
+      if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
+      if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
+      if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
+      if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
+      /* byte 1 */
+      if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
+      /* byte 2 */
+      if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
+      if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
+      /* byte 3 */
+      if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
+      /* byte 4 */
+      if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
+      if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
+      if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
+      /* byte 5 */
+      if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
+      /* byte 6 */
+      if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
+      if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
+      /* byte 7 */
+      if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);
+
+#     undef PUT
+      /* END sequentialise the tree described by (descr,tree). */
+
+   }
+   tl_assert(cloff == N_LINE_ARANGE);
+   tl_assert(dstUsed <= nDst);
+
+   *dstUsedP = dstUsed;
+}
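+
+/* Illustrative example: a tree whose descriptor is
+   TREE_DESCR_32_0 | TREE_DESCR_16_2 | TREE_DESCR_8_6 | TREE_DESCR_8_7
+   contributes four entries to dst, namely {4, tree[0]}, {2, tree[4]},
+   {1, tree[6]} and {1, tree[7]}.  The counts for each tree always sum
+   to 8, and hence to N_LINE_ARANGE for the whole line. */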
+
+/* Write the cacheline 'wix' to backing store.  Where it ends up
+   is determined by its tag field. */
+static __attribute__((noinline)) void cacheline_wback ( UWord wix )
+{
+   Word        i, j, k, m;
+   Addr        tag;
+   SecMap*     sm;
+   CacheLine*  cl;
+   LineZ* lineZ;
+   LineF* lineF;
+   Word        zix, fix, csvalsUsed;
+   CountedSVal csvals[N_LINE_ARANGE];
+   SVal        sv;
+
+   if (0)
+   VG_(printf)("scache wback line %d\n", (Int)wix);
+
+   tl_assert(wix >= 0 && wix < N_WAY_NENT);
+
+   tag =  cache_shmem.tags0[wix];
+   cl  = &cache_shmem.lyns0[wix];
+
+   /* The cache line may have been invalidated; if so, ignore it. */
+   if (!is_valid_scache_tag(tag))
+      return;
+
+   /* Where are we going to put it? */
+   sm         = NULL;
+   lineZ      = NULL;
+   lineF      = NULL;
+   zix = fix = -1;
+
+   /* find the Z line to write in and rcdec it or the associated F
+      line. */
+   find_Z_for_writing( &sm, &zix, tag );
+
+   tl_assert(sm);
+   tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
+   lineZ = &sm->linesZ[zix];
+
+   /* Generate the data to be stored */
+   if (SCE_CACHELINE)
+      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+
+   csvalsUsed = -1;
+   sequentialise_CacheLine( csvals, &csvalsUsed, 
+                            N_LINE_ARANGE, cl );
+   tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
+   if (0) VG_(printf)("%lu ", csvalsUsed);
+
+   lineZ->dict[0] = lineZ->dict[1] 
+                  = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
+
+   /* i indexes actual shadow values, k is cursor in csvals */
+   i = 0;
+   for (k = 0; k < csvalsUsed; k++) {
+
+      sv = csvals[k].sval;
+      if (SCE_SVALS)
+         tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
+      /* do we already have it? */
+      if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
+      if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
+      if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
+      if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
+      /* no.  look for a free slot. */
+      if (SCE_SVALS)
+         tl_assert(sv != SVal_INVALID);
+      if (lineZ->dict[0] 
+          == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
+      if (lineZ->dict[1]
+          == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
+      if (lineZ->dict[2]
+          == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
+      if (lineZ->dict[3]
+          == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
+      break; /* we'll have to use the f rep */
+     dict_ok:
+      m = csvals[k].count;
+      if (m == 8) {
+         write_twobit_array( lineZ->ix2s, i+0, j );
+         write_twobit_array( lineZ->ix2s, i+1, j );
+         write_twobit_array( lineZ->ix2s, i+2, j );
+         write_twobit_array( lineZ->ix2s, i+3, j );
+         write_twobit_array( lineZ->ix2s, i+4, j );
+         write_twobit_array( lineZ->ix2s, i+5, j );
+         write_twobit_array( lineZ->ix2s, i+6, j );
+         write_twobit_array( lineZ->ix2s, i+7, j );
+         i += 8;
+      }
+      else if (m == 4) {
+         write_twobit_array( lineZ->ix2s, i+0, j );
+         write_twobit_array( lineZ->ix2s, i+1, j );
+         write_twobit_array( lineZ->ix2s, i+2, j );
+         write_twobit_array( lineZ->ix2s, i+3, j );
+         i += 4;
+      }
+      else if (m == 1) {
+         write_twobit_array( lineZ->ix2s, i+0, j );
+         i += 1;
+      }
+      else if (m == 2) {
+         write_twobit_array( lineZ->ix2s, i+0, j );
+         write_twobit_array( lineZ->ix2s, i+1, j );
+         i += 2;
+      }
+      else {
+         tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
+      }
+
+   }
+
+   if (LIKELY(i == N_LINE_ARANGE)) {
+      /* Construction of the compressed representation was
+         successful. */
+      rcinc_LineZ(lineZ);
+      stats__cache_Z_wbacks++;
+   } else {
+      /* Cannot use the compressed(z) representation.  Use the full(f)
+         rep instead. */
+      tl_assert(i >= 0 && i < N_LINE_ARANGE);
+      alloc_F_for_writing( sm, &fix );
+      tl_assert(sm->linesF);
+      tl_assert(sm->linesF_size > 0);
+      tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
+      lineF = &sm->linesF[fix];
+      tl_assert(!lineF->inUse);
+      lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
+      lineZ->dict[1] = (SVal)fix;
+      lineF->inUse = True;
+      i = 0;
+      for (k = 0; k < csvalsUsed; k++) {
+         if (SCE_SVALS)
+            tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
+         sv = csvals[k].sval;
+         if (SCE_SVALS)
+            tl_assert(sv != SVal_INVALID);
+         for (m = csvals[k].count; m > 0; m--) {
+            lineF->w64s[i] = sv;
+            i++;
+         }
+      }
+      tl_assert(i == N_LINE_ARANGE);
+      rcinc_LineF(lineF);
+      stats__cache_F_wbacks++;
+   }
+
+   //if (anyShared)
+   //   sm->mbHasShared = True;
+
+   /* mb_tidy_one_cacheline(); */
+}
+
+/* Fetch the cacheline 'wix' from the backing store.  The tag
+   associated with 'wix' is assumed to have already been filled in;
+   hence that is used to determine where in the backing store to read
+   from. */
+static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
+{
+   Word       i;
+   Addr       tag;
+   CacheLine* cl;
+   LineZ*     lineZ;
+   LineF*     lineF;
+
+   if (0)
+   VG_(printf)("scache fetch line %d\n", (Int)wix);
+
+   tl_assert(wix >= 0 && wix < N_WAY_NENT);
+
+   tag =  cache_shmem.tags0[wix];
+   cl  = &cache_shmem.lyns0[wix];
+
+   /* reject nonsense requests */
+   tl_assert(is_valid_scache_tag(tag));
+
+   lineZ = NULL;
+   lineF = NULL;
+   find_ZF_for_reading( &lineZ, &lineF, tag );
+   tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
+
+   /* expand the data into the bottom layer of the tree, then get
+      cacheline_normalise to build the descriptor array. */
+   if (lineF) {
+      tl_assert(lineF->inUse);
+      for (i = 0; i < N_LINE_ARANGE; i++) {
+         cl->svals[i] = lineF->w64s[i];
+      }
+      stats__cache_F_fetches++;
+   } else {
+      for (i = 0; i < N_LINE_ARANGE; i++) {
+         SVal sv;
+         UWord ix = read_twobit_array( lineZ->ix2s, i );
+         /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
+         sv = lineZ->dict[ix];
+         tl_assert(sv != SVal_INVALID);
+         cl->svals[i] = sv;
+      }
+      stats__cache_Z_fetches++;
+   }
+   normalise_CacheLine( cl );
+}
+
+static void shmem__invalidate_scache ( void ) {
+   Word wix;
+   if (0) VG_(printf)("%s","scache inval\n");
+   tl_assert(!is_valid_scache_tag(1));
+   for (wix = 0; wix < N_WAY_NENT; wix++) {
+      cache_shmem.tags0[wix] = 1/*INVALID*/;
+   }
+   stats__cache_invals++;
+}
+
+static void shmem__flush_and_invalidate_scache ( void ) {
+   Word wix;
+   Addr tag;
+   if (0) VG_(printf)("%s","scache flush and invalidate\n");
+   tl_assert(!is_valid_scache_tag(1));
+   for (wix = 0; wix < N_WAY_NENT; wix++) {
+      tag = cache_shmem.tags0[wix];
+      if (tag == 1/*INVALID*/) {
+         /* already invalid; nothing to do */
+      } else {
+         tl_assert(is_valid_scache_tag(tag));
+         cacheline_wback( wix );
+      }
+      cache_shmem.tags0[wix] = 1/*INVALID*/;
+   }
+   stats__cache_flushes++;
+   stats__cache_invals++;
+}
+
+
+static inline Bool aligned16 ( Addr a ) {
+   return 0 == (a & 1);
+}
+static inline Bool aligned32 ( Addr a ) {
+   return 0 == (a & 3);
+}
+static inline Bool aligned64 ( Addr a ) {
+   return 0 == (a & 7);
+}
+static inline UWord get_cacheline_offset ( Addr a ) {
+   return (UWord)(a & (N_LINE_ARANGE - 1));
+}
+static inline Addr cacheline_ROUNDUP ( Addr a ) {
+   return ROUNDUP(a, N_LINE_ARANGE);
+}
+static inline Addr cacheline_ROUNDDN ( Addr a ) {
+   return ROUNDDN(a, N_LINE_ARANGE);
+}
+static inline UWord get_treeno ( Addr a ) {
+   return get_cacheline_offset(a) >> 3;
+}
+static inline UWord get_tree_offset ( Addr a ) {
+   return a & 7;
+}
+
+static __attribute__((noinline))
+       CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
+static inline CacheLine* get_cacheline ( Addr a )
+{
+   /* tag is 'a' with the in-line offset masked out, 
+      eg a[31]..a[4] 0000 */
+   Addr       tag = a & ~(N_LINE_ARANGE - 1);
+   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
+   stats__cache_totrefs++;
+   if (LIKELY(tag == cache_shmem.tags0[wix])) {
+      return &cache_shmem.lyns0[wix];
+   } else {
+      return get_cacheline_MISS( a );
+   }
+}
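+
+/* For instance, with N_LINE_BITS == 6 and N_WAY_BITS == 16, an access
+   to (hypothetical) address 0x5001234 has tag 0x5001200 and maps to
+   way (0x5001234 >> 6) & 0xFFFF == 0x48; repeated accesses within the
+   same 64-byte line therefore hit directly in cache_shmem. */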
+
+static __attribute__((noinline))
+       CacheLine* get_cacheline_MISS ( Addr a )
+{
+   /* tag is 'a' with the in-line offset masked out, 
+      eg a[31]..a[4] 0000 */
+
+   CacheLine* cl;
+   Addr*      tag_old_p;
+   Addr       tag = a & ~(N_LINE_ARANGE - 1);
+   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
+
+   tl_assert(tag != cache_shmem.tags0[wix]);
+
+   /* Dump the old line into the backing store. */
+   stats__cache_totmisses++;
+
+   cl        = &cache_shmem.lyns0[wix];
+   tag_old_p = &cache_shmem.tags0[wix];
+
+   if (is_valid_scache_tag( *tag_old_p )) {
+      /* EXPENSIVE and REDUNDANT: callee does it */
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+      cacheline_wback( wix );
+   }
+   /* and reload the new one */
+   *tag_old_p = tag;
+   cacheline_fetch( wix );
+   if (SCE_CACHELINE)
+      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   return cl;
+}
+
+static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
+   stats__cline_64to32pulldown++;
+   switch (toff) {
+      case 0: case 4:
+         tl_assert(descr & TREE_DESCR_64);
+         tree[4] = tree[0];
+         descr &= ~TREE_DESCR_64;
+         descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
+         break;
+      default:
+         tl_assert(0);
+   }
+   return descr;
+}
+
+static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
+   stats__cline_32to16pulldown++;
+   switch (toff) {
+      case 0: case 2:
+         if (!(descr & TREE_DESCR_32_0)) {
+            descr = pulldown_to_32(tree, 0, descr);
+         }
+         tl_assert(descr & TREE_DESCR_32_0);
+         tree[2] = tree[0];
+         descr &= ~TREE_DESCR_32_0;
+         descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
+         break;
+      case 4: case 6:
+         if (!(descr & TREE_DESCR_32_1)) {
+            descr = pulldown_to_32(tree, 4, descr);
+         }
+         tl_assert(descr & TREE_DESCR_32_1);
+         tree[6] = tree[4];
+         descr &= ~TREE_DESCR_32_1;
+         descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
+         break;
+      default:
+         tl_assert(0);
+   }
+   return descr;
+}
+
+static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
+   stats__cline_16to8pulldown++;
+   switch (toff) {
+      case 0: case 1:
+         if (!(descr & TREE_DESCR_16_0)) {
+            descr = pulldown_to_16(tree, 0, descr);
+         }
+         tl_assert(descr & TREE_DESCR_16_0);
+         tree[1] = tree[0];
+         descr &= ~TREE_DESCR_16_0;
+         descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
+         break;
+      case 2: case 3:
+         if (!(descr & TREE_DESCR_16_1)) {
+            descr = pulldown_to_16(tree, 2, descr);
+         }
+         tl_assert(descr & TREE_DESCR_16_1);
+         tree[3] = tree[2];
+         descr &= ~TREE_DESCR_16_1;
+         descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
+         break;
+      case 4: case 5:
+         if (!(descr & TREE_DESCR_16_2)) {
+            descr = pulldown_to_16(tree, 4, descr);
+         }
+         tl_assert(descr & TREE_DESCR_16_2);
+         tree[5] = tree[4];
+         descr &= ~TREE_DESCR_16_2;
+         descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
+         break;
+      case 6: case 7:
+         if (!(descr & TREE_DESCR_16_3)) {
+            descr = pulldown_to_16(tree, 6, descr);
+         }
+         tl_assert(descr & TREE_DESCR_16_3);
+         tree[7] = tree[6];
+         descr &= ~TREE_DESCR_16_3;
+         descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
+         break;
+      default:
+         tl_assert(0);
+   }
+   return descr;
+}
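+
+/* Example of the pulldown chain (illustrative): if a tree currently
+   has descr == TREE_DESCR_64 and an 8-bit access touches byte 5, then
+   pulldown_to_8(tree, 5, descr) splits 64 -> 32_1|32_0, then
+   32_1 -> 16_3|16_2, then 16_2 -> 8_5|8_4, duplicating the original
+   leaf value into tree[4], tree[6] and tree[5] along the way, and
+   returns TREE_DESCR_32_0 | TREE_DESCR_16_3 | TREE_DESCR_8_5
+   | TREE_DESCR_8_4. */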
+
+
+static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
+   UShort mask;
+   switch (toff) {
+      case 0:
+         mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
+         tl_assert( (descr & mask) == mask );
+         descr &= ~mask;
+         descr |= TREE_DESCR_16_0;
+         break;
+      case 2:
+         mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
+         tl_assert( (descr & mask) == mask );
+         descr &= ~mask;
+         descr |= TREE_DESCR_16_1;
+         break;
+      case 4:
+         mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
+         tl_assert( (descr & mask) == mask );
+         descr &= ~mask;
+         descr |= TREE_DESCR_16_2;
+         break;
+      case 6:
+         mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
+         tl_assert( (descr & mask) == mask );
+         descr &= ~mask;
+         descr |= TREE_DESCR_16_3;
+         break;
+      default:
+         tl_assert(0);
+   }
+   return descr;
+}
+
+static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
+   UShort mask;
+   switch (toff) {
+      case 0:
+         if (!(descr & TREE_DESCR_16_0))
+            descr = pullup_descr_to_16(descr, 0);
+         if (!(descr & TREE_DESCR_16_1))
+            descr = pullup_descr_to_16(descr, 2);
+         mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
+         tl_assert( (descr & mask) == mask );
+         descr &= ~mask;
+         descr |= TREE_DESCR_32_0;
+         break;
+      case 4:
+         if (!(descr & TREE_DESCR_16_2))
+            descr = pullup_descr_to_16(descr, 4);
+         if (!(descr & TREE_DESCR_16_3))
+            descr = pullup_descr_to_16(descr, 6);
+         mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
+         tl_assert( (descr & mask) == mask );
+         descr &= ~mask;
+         descr |= TREE_DESCR_32_1;
+         break;
+      default:
+         tl_assert(0);
+   }
+   return descr;
+}
+
+static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
+   switch (toff) {
+      case 0: case 4:
+         return 0 != (descr & TREE_DESCR_64);
+      default:
+         tl_assert(0);
+   }
+}
+
+static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
+   switch (toff) {
+      case 0:
+         return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
+      case 2:
+         return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
+      case 4:
+         return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
+      case 6:
+         return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
+      default:
+         tl_assert(0);
+   }
+}
+
+/* ------------ Cache management ------------ */
+
+static void zsm_flush_cache ( void )
+{
+   shmem__flush_and_invalidate_scache();
+}
+
+
+static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
+{
+   tl_assert( sizeof(UWord) == sizeof(Addr) );
+
+   rcinc = p_rcinc;
+   rcdec = p_rcdec;
+
+   tl_assert(map_shmem == NULL);
+   map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
+                           HG_(free), 
+                           NULL/*unboxed UWord cmp*/);
+   tl_assert(map_shmem != NULL);
+   shmem__invalidate_scache();
+
+   /* a SecMap must contain an integral number of CacheLines */
+   tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
+   /* also ... a CacheLine holds an integral number of trees */
+   tl_assert(0 == (N_LINE_ARANGE % 8));
+}
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+//                                                             //
+// SECTION END compressed shadow memory                        //
+//                                                             //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+//                                                             //
+// SECTION BEGIN vts primitives                                //
+//                                                             //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+#ifndef __HB_VTS_H
+#define __HB_VTS_H
+
+/* VtsIDs can't exceed 30 bits, since they have to be packed into the
+   lowest 30 bits of an SVal. */
+typedef  UInt  VtsID;
+#define VtsID_INVALID 0xFFFFFFFF
+
+/* A VTS contains .ts, its vector clock, and also .id, a field to hold
+   a backlink for the caller's convenience.  Since we have no idea
+   what to set that to in the library, it always gets set to
+   VtsID_INVALID. */
+typedef
+   struct {
+      VtsID   id;
+      XArray* ts; /* XArray* ScalarTS(abstract) */
+   }
+   VTS;
+
+
+/* Create a new, empty VTS. */
+VTS* VTS__new ( void );
+
+/* Delete this VTS in its entirety. */
+void VTS__delete ( VTS* vts );
+
+/* Create a new singleton VTS. */
+VTS* VTS__singleton ( Thr* thr, ULong tym );
+
+/* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
+   not modified. */
+VTS* VTS__tick ( Thr* me, VTS* vts );
+
+/* Return a new VTS constructed as the join (max) of the 2 args.
+   Neither arg is modified. */
+VTS* VTS__join ( VTS* a, VTS* b );
+
+/* Compute the partial ordering relation of the two args. */
+typedef
+   enum { POrd_EQ=4, POrd_LT, POrd_GT, POrd_UN }
+   POrd;
+
+POrd VTS__cmp ( VTS* a, VTS* b );
+
+/* Compute an arbitrary structural (total) ordering on the two args,
+   based on their VCs, so they can be looked up in a table, tree, etc.
+   Returns -1, 0 or 1. */
+Word VTS__cmp_structural ( VTS* a, VTS* b );
+
+/* Debugging only.  Display the given VTS in the buffer. */
+void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
+
+/* Debugging only.  Return vts[index], so to speak. */
+ULong VTS__indexAt_SLOW ( VTS* vts, Thr* index );
+
+#endif /* ! __HB_VTS_H */
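+
+/* Illustrative sketch only (not part of the build): typical use of
+   this interface, for hypothetical threads thrA and thrB:
+
+      VTS* a  = VTS__singleton( thrA, 1 );   // [thrA:1]
+      VTS* a2 = VTS__tick( thrA, a );        // [thrA:2]
+      VTS* b  = VTS__singleton( thrB, 1 );   // [thrB:1]
+      VTS* j  = VTS__join( a2, b );          // [thrA:2, thrB:1]
+      POrd o  = VTS__cmp( a, j );            // POrd_LT: a is before j
+
+   VTS__singleton, VTS__tick and VTS__join each return a freshly
+   allocated VTS, which can later be released with VTS__delete. */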
+
+
+/*--------------- to do with Vector Timestamps ---------------*/
+
+/* Scalar Timestamp */
+typedef
+   struct {
+      Thr*    thr;
+      ULong   tym;
+   }
+   ScalarTS;
+
+
+static Bool is_sane_VTS ( VTS* vts )
+{
+   UWord     i, n;
+   ScalarTS  *st1, *st2;
+   if (!vts) return False;
+   if (!vts->ts) return False;
+   n = VG_(sizeXA)( vts->ts );
+   if (n >= 2) {
+      for (i = 0; i < n-1; i++) {
+         st1 = VG_(indexXA)( vts->ts, i );
+         st2 = VG_(indexXA)( vts->ts, i+1 );
+         if (st1->thr >= st2->thr)
+            return False;
+         if (st1->tym == 0 || st2->tym == 0)
+            return False;
+      }
+   }
+   return True;
+}
+
+
+/* Create a new, empty VTS.
+*/
+VTS* VTS__new ( void )
+{
+   VTS* vts;
+   vts = HG_(zalloc)( "libhb.VTS__new.1", sizeof(VTS) );
+   tl_assert(vts);
+   vts->id = VtsID_INVALID;
+   vts->ts = VG_(newXA)( HG_(zalloc), "libhb.VTS__new.2",
+                         HG_(free), sizeof(ScalarTS) );
+   tl_assert(vts->ts);
+   return vts;
+}
+
+
+/* Delete this VTS in its entirety.
+*/
+void VTS__delete ( VTS* vts )
+{
+   tl_assert(vts);
+   tl_assert(vts->ts);
+   VG_(deleteXA)( vts->ts );
+   HG_(free)(vts);
+}
+
+
+/* Create a new singleton VTS. 
+*/
+VTS* VTS__singleton ( Thr* thr, ULong tym ) {
+   ScalarTS st;
+   VTS*     vts;
+   tl_assert(thr);
+   tl_assert(tym >= 1);
+   vts = VTS__new();
+   st.thr = thr;
+   st.tym = tym;
+   VG_(addToXA)( vts->ts, &st );
+   return vts;
+}
+
+
+/* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
+   not modified.
+*/
+VTS* VTS__tick ( Thr* me, VTS* vts )
+{
+   ScalarTS* here = NULL;
+   ScalarTS  tmp;
+   VTS*      res;
+   Word      i, n; 
+   tl_assert(me);
+   tl_assert(is_sane_VTS(vts));
+   //if (0) VG_(printf)("tick vts thrno %ld szin %d\n",
+   //                   (Word)me->errmsg_index, (Int)VG_(sizeXA)(vts) );
+   res = VTS__new();
+   n = VG_(sizeXA)( vts->ts );
+
+   /* main loop doesn't handle zero-entry case correctly, so
+      special-case it. */
+   if (n == 0) {
+      tmp.thr = me;
+      tmp.tym = 1;
+      VG_(addToXA)( res->ts, &tmp );
+      tl_assert(is_sane_VTS(res));
+      return res;
+   }
+
+   for (i = 0; i < n; i++) {
+      here = VG_(indexXA)( vts->ts, i );
+      if (me < here->thr) {
+         /* We just went past 'me', without seeing it. */
+         tmp.thr = me;
+         tmp.tym = 1;
+         VG_(addToXA)( res->ts, &tmp );
+         tmp = *here;
+         VG_(addToXA)( res->ts, &tmp );
+         i++;
+         break;
+      } 
+      else if (me == here->thr) {
+         tmp = *here;
+         tmp.tym++;
+         VG_(addToXA)( res->ts, &tmp );
+         i++;
+         break;
+      }
+      else /* me > here->thr */ {
+         tmp = *here;
+         VG_(addToXA)( res->ts, &tmp );
+      }
+   }
+   tl_assert(i >= 0 && i <= n);
+   if (i == n && here && here->thr < me) {
+      tmp.thr = me;
+      tmp.tym = 1;
+      VG_(addToXA)( res->ts, &tmp );
+   } else {
+      for (/*keepgoing*/; i < n; i++) {
+         here = VG_(indexXA)( vts->ts, i );
+         tmp = *here;
+         VG_(addToXA)( res->ts, &tmp );
+      }
+   }
+   tl_assert(is_sane_VTS(res));
+   //if (0) VG_(printf)("tick vts thrno %ld szou %d\n",
+   //                   (Word)me->errmsg_index, (Int)VG_(sizeXA)(res) );
+   return res;
+}
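+
+/* Worked example (hypothetical Thr* values T1 < T2 < T3):
+   VTS__tick( T2, [ {T1,3}, {T3,5} ] ) = [ {T1,3}, {T2,1}, {T3,5} ]
+      -- T2 was absent, so it appears with timestamp 1;
+   VTS__tick( T3, [ {T1,3}, {T3,5} ] ) = [ {T1,3}, {T3,6} ]
+      -- T3 was present, so its timestamp is incremented. */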
+
+
+/* Return a new VTS constructed as the join (max) of the 2 args.
+   Neither arg is modified.
+*/
+VTS* VTS__join ( VTS* a, VTS* b )
+{
+   Word     ia, ib, useda, usedb;
+   ULong    tyma, tymb, tymMax;
+   Thr*     thr;
+   VTS*     res;
+   ScalarTS *tmpa, *tmpb;
+
+   tl_assert(a && a->ts);
+   tl_assert(b && b->ts);
+   useda = VG_(sizeXA)( a->ts );
+   usedb = VG_(sizeXA)( b->ts );
+
+   res = VTS__new();
+   ia = ib = 0;
+
+   while (1) {
+
+      /* This logic is to enumerate triples (thr, tyma, tymb) drawn
+         from a and b in order, where thr is the next Thr*
+         occurring in either a or b, and tyma/b are the relevant
+         scalar timestamps, taking into account implicit zeroes. */
+      tl_assert(ia >= 0 && ia <= useda);
+      tl_assert(ib >= 0 && ib <= usedb);
+      tmpa = tmpb = NULL;
+
+      if (ia == useda && ib == usedb) {
+         /* both empty - done */
+         break;
+      }
+      else
+      if (ia == useda && ib != usedb) {
+         /* a empty, use up b */
+         tmpb = VG_(indexXA)( b->ts, ib );
+         thr  = tmpb->thr;
+         tyma = 0;
+         tymb = tmpb->tym;
+         ib++;
+      }
+      else
+      if (ia != useda && ib == usedb) {
+         /* b empty, use up a */
+         tmpa = VG_(indexXA)( a->ts, ia );
+         thr  = tmpa->thr;
+         tyma = tmpa->tym;
+         tymb = 0;
+         ia++;
+      }
+      else {
+         /* both not empty; extract lowest-Thr*'d triple */
+         tmpa = VG_(indexXA)( a->ts, ia );
+         tmpb = VG_(indexXA)( b->ts, ib );
+         if (tmpa->thr < tmpb->thr) {
+            /* a has the lowest unconsidered Thr* */
+            thr  = tmpa->thr;
+            tyma = tmpa->tym;
+            tymb = 0;
+            ia++;
+         }
+         else
+         if (tmpa->thr > tmpb->thr) {
+            /* b has the lowest unconsidered Thr* */
+            thr  = tmpb->thr;
+            tyma = 0;
+            tymb = tmpb->tym;
+            ib++;
+         } else {
+            /* they both next mention the same Thr* */
+            tl_assert(tmpa->thr == tmpb->thr);
+            thr  = tmpa->thr; /* == tmpb->thr */
+            tyma = tmpa->tym;
+            tymb = tmpb->tym;
+            ia++;
+            ib++;
+         }
+      }
+
+      /* having laboriously determined (thr, tyma, tymb), do something
+         useful with it. */
+      tymMax = tyma > tymb ? tyma : tymb;
+      if (tymMax > 0) {
+         ScalarTS st;
+         st.thr = thr;
+         st.tym = tymMax;
+         VG_(addToXA)( res->ts, &st );
+      }
+
+   }
+
+   tl_assert(is_sane_VTS( res ));
+
+   return res;
+}
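+
+/* Worked example (hypothetical Thr* values T1 < T2 < T3):
+   VTS__join( [ {T1,3}, {T2,1} ], [ {T2,4}, {T3,2} ] )
+      = [ {T1,3}, {T2,4}, {T3,2} ]
+   -- each thread's timestamp is the max of the two inputs, with
+   absent threads treated as having timestamp zero. */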
+
+
+/* Compute the partial ordering relation of the two args.
+*/
+POrd VTS__cmp ( VTS* a, VTS* b )
+{
+   Word     ia, ib, useda, usedb;
+   ULong    tyma, tymb;
+   Thr*     thr;
+   ScalarTS *tmpa, *tmpb;
+
+   Bool all_leq = True;
+   Bool all_geq = True;
+
+   tl_assert(a && a->ts);
+   tl_assert(b && b->ts);
+   useda = VG_(sizeXA)( a->ts );
+   usedb = VG_(sizeXA)( b->ts );
+
+   ia = ib = 0;
+
+   while (1) {
+
+      /* This logic is to enumerate triples (thr, tyma, tymb) drawn
+         from a and b in order, where thr is the next Thr*
+         occurring in either a or b, and tyma/b are the relevant
+         scalar timestamps, taking into account implicit zeroes. */
+      tl_assert(ia >= 0 && ia <= useda);
+      tl_assert(ib >= 0 && ib <= usedb);
+      tmpa = tmpb = NULL;
+
+      if (ia == useda && ib == usedb) {
+         /* both empty - done */
+         break;
+      }
+      else
+      if (ia == useda && ib != usedb) {
+         /* a empty, use up b */
+         tmpb = VG_(indexXA)( b->ts, ib );
+         thr  = tmpb->thr;
+         tyma = 0;
+         tymb = tmpb->tym;
+         ib++;
+      }
+      else
+      if (ia != useda && ib == usedb) {
+         /* b empty, use up a */
+         tmpa = VG_(indexXA)( a->ts, ia );
+         thr  = tmpa->thr;
+         tyma = tmpa->tym;
+         tymb = 0;
+         ia++;
+      }
+      else {
+         /* both not empty; extract lowest-Thr*'d triple */
+         tmpa = VG_(indexXA)( a->ts, ia );
+         tmpb = VG_(indexXA)( b->ts, ib );
+         if (tmpa->thr < tmpb->thr) {
+            /* a has the lowest unconsidered Thr* */
+            thr  = tmpa->thr;
+            tyma = tmpa->tym;
+            tymb = 0;
+            ia++;
+         }
+         else
+         if (tmpa->thr > tmpb->thr) {
+            /* b has the lowest unconsidered Thr* */
+            thr  = tmpb->thr;
+            tyma = 0;
+            tymb = tmpb->tym;
+            ib++;
+         } else {
+            /* they both next mention the same Thr* */
+            tl_assert(tmpa->thr == tmpb->thr);
+            thr  = tmpa->thr; /* == tmpb->thr */
+            tyma = tmpa->tym;
+            tymb = tmpb->tym;
+            ia++;
+            ib++;
+         }
+      }
+
+      /* having laboriously determined (thr, tyma, tymb), do something
+         useful with it. */
+      if (tyma < tymb)
+         all_geq = False;
+      if (tyma > tymb)
+         all_leq = False;
+   }
+
+   if (all_leq && all_geq)
+      return POrd_EQ;
+   /* now we know they aren't equal, so either all_leq or all_geq or
+      both are false. */
+   if (all_leq)
+      return POrd_LT;
+   if (all_geq)
+      return POrd_GT;
+   /* hmm, neither all_geq or all_leq.  This means unordered. */
+   return POrd_UN;
+}
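+
+/* Worked example (hypothetical Thr* values T1 < T2):
+   VTS__cmp( [ {T1,2} ],         [ {T1,2} ] )         = POrd_EQ
+   VTS__cmp( [ {T1,1} ],         [ {T1,2} ] )         = POrd_LT
+   VTS__cmp( [ {T1,1} ],         [ {T1,1}, {T2,1} ] ) = POrd_LT
+   VTS__cmp( [ {T1,1}, {T2,2} ], [ {T1,2}, {T2,1} ] ) = POrd_UN
+   The last case is unordered because neither vector dominates the
+   other componentwise. */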
+
+
+/* Compute an arbitrary structural (total) ordering on the two args,
+   based on their VCs, so they can be looked up in a table, tree, etc.
+   Returns -1, 0 or 1.  (really just 'deriving Ord' :-)
+*/
+Word VTS__cmp_structural ( VTS* a, VTS* b )
+{
+   /* We just need to generate an arbitrary total ordering based on
+      a->ts and b->ts.  Preferably do it in a way which exposes likely
+      differences relatively quickly. */
+   Word     i, useda, usedb;
+   ScalarTS *tmpa, *tmpb;
+
+   tl_assert(a && a->ts);
+   tl_assert(b && b->ts);
+   useda = VG_(sizeXA)( a->ts );
+   usedb = VG_(sizeXA)( b->ts );
+
+   if (useda < usedb) return -1;
+   if (useda > usedb) return 1;
+
+   /* Same length vectors, so let's step through them together. */
+   tl_assert(useda == usedb);
+   for (i = 0; i < useda; i++) {
+      tmpa = VG_(indexXA)( a->ts, i );
+      tmpb = VG_(indexXA)( b->ts, i );
+      if (tmpa->tym < tmpb->tym) return -1;
+      if (tmpa->tym > tmpb->tym) return 1;
+      if (tmpa->thr < tmpb->thr) return -1;
+      if (tmpa->thr > tmpb->thr) return 1;
+   }
+
+   /* They're identical. */
+   return 0;
+}
+
+
+/* Debugging only.  Display the given VTS in the buffer.
+*/
+void VTS__show ( HChar* buf, Int nBuf, VTS* vts ) {
+   ScalarTS* st;
+   HChar     unit[64];
+   Word      i, n;
+   Int       avail = nBuf;
+   tl_assert(vts && vts->ts);
+   tl_assert(nBuf > 16);
+   buf[0] = '[';
+   buf[1] = 0;
+   n = VG_(sizeXA)( vts->ts );
+   for (i = 0; i < n; i++) {
+      tl_assert(avail >= 40);
+      st = VG_(indexXA)( vts->ts, i );
+      VG_(memset)(unit, 0, sizeof(unit));
+      VG_(sprintf)(unit, i < n-1 ? "%p:%lld " : "%p:%lld",
+                         st->thr, st->tym);
+      if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
+         VG_(strcat)(buf, " ...]");
+         buf[nBuf-1] = 0;
+         return;
+      }
+      VG_(strcat)(buf, unit);
+      avail -= VG_(strlen)(unit);
+   }
+   VG_(strcat)(buf, "]");
+   buf[nBuf-1] = 0;
+}
+
+
+/* Debugging only.  Return vts[index], so to speak.
+*/
+ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx ) {
+   UWord i, n;
+   tl_assert(vts && vts->ts);
+   n = VG_(sizeXA)( vts->ts );
+   for (i = 0; i < n; i++) {
+      ScalarTS* st = VG_(indexXA)( vts->ts, i );
+      if (st->thr == idx)
+         return st->tym;
+   }
+   return 0;
+}
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+//                                                             //
+// SECTION END vts primitives                                  //
+//                                                             //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+//                                                             //
+// SECTION BEGIN main library                                  //
+//                                                             //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// VTS set                                             //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+static WordFM* /* VTS* void void */ vts_set = NULL;
+
+static void vts_set_init ( void )
+{
+   tl_assert(!vts_set);
+   vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
+                         HG_(free),
+                         (Word(*)(UWord,UWord))VTS__cmp_structural );
+   tl_assert(vts_set);
+}
+
+/* Given a newly made VTS, look in vts_set to see if we already have
+   an identical one.  If yes, free up this one and return instead a
+   pointer to the existing one.  If no, add this one to the set and
+   return the same pointer.  Caller differentiates the two cases by
+   comparing returned pointer with the supplied one (although that
+   does require that the supplied VTS is not already in the set).
+*/
+static VTS* vts_set__find_and_dealloc__or_add ( VTS* cand )
+{
+   UWord keyW, valW;
+   /* lookup cand (by value) */
+   if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
+      /* found it */
+      tl_assert(valW == 0);
+      /* if this fails, cand (by ref) was already present (!) */
+      tl_assert(keyW != (UWord)cand);
+      VTS__delete(cand);
+      return (VTS*)keyW;
+   } else {
+      /* not present.  Add and return pointer to same. */
+      VG_(addToFM)( vts_set, (UWord)cand, 0/*val is unused*/ );
+      return cand;
+   }
+}
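+
+/* A typical call pattern (see vts_tab__find_and_dealloc__or_add
+   below): build a fresh VTS, then immediately intern it --
+
+      VTS* nyu = VTS__join( a, b );
+      nyu = vts_set__find_and_dealloc__or_add( nyu );
+
+   After the call, 'nyu' may point at a pre-existing, structurally
+   identical VTS; in that case the freshly built one has already been
+   deleted and must not be used or freed by the caller. */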
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// VTS table                                           //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+static void VtsID__invalidate_caches ( void ); /* fwds */
+
+/* A type to hold VTS table entries.  Invariants:
+   If .vts == NULL, then this entry is not in use, so:
+   - .rc == 0
+   - this entry is on the freelist (unfortunately, this does not imply
+     any constraint on the value of .freelink)
+   If .vts != NULL, then this entry is in use:
+   - .vts is findable in vts_set
+   - .vts->id == this entry number
+   - no specific value for .rc (even 0 is OK)
+   - this entry is not on the freelist, so .freelink == VtsID_INVALID
+*/
+typedef
+   struct {
+      VTS*  vts;      /* vts, in vts_set */
+      UWord rc;       /* reference count - enough for entire aspace */
+      VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
+   }
+   VtsTE;
+
+/* The VTS table. */
+static XArray* /* of VtsTE */ vts_tab = NULL;
+
+/* An index into the VTS table, indicating the start of the list of
+   free (available for use) entries.  If the list is empty, this is
+   VtsID_INVALID. */
+static VtsID vts_tab_freelist = VtsID_INVALID;
+
+/* Do a GC of vts_tab when the freelist becomes empty AND the size of
+   vts_tab equals or exceeds this size.  After GC, the value here is
+   set appropriately so as to check for the next GC point. */
+static Word vts_next_GC_at = 1000;
+
+static void vts_tab_init ( void )
+{
+   vts_tab
+      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
+                    HG_(free), sizeof(VtsTE) );
+   vts_tab_freelist
+      = VtsID_INVALID;
+   tl_assert(vts_tab);
+}
+
+/* Add ii to the free list, checking that it looks out-of-use. */
+static void add_to_free_list ( VtsID ii )
+{
+   VtsTE* ie = VG_(indexXA)( vts_tab, ii );
+   tl_assert(ie->vts == NULL);
+   tl_assert(ie->rc == 0);
+   tl_assert(ie->freelink == VtsID_INVALID);
+   ie->freelink = vts_tab_freelist;
+   vts_tab_freelist = ii;
+}
+
+/* Get an entry from the free list.  This will return VtsID_INVALID if
+   the free list is empty. */
+static VtsID get_from_free_list ( void )
+{
+   VtsID  ii;
+   VtsTE* ie;
+   if (vts_tab_freelist == VtsID_INVALID)
+      return VtsID_INVALID;
+   ii = vts_tab_freelist;
+   ie = VG_(indexXA)( vts_tab, ii );
+   tl_assert(ie->vts == NULL);
+   tl_assert(ie->rc == 0);
+   vts_tab_freelist = ie->freelink;
+   return ii;
+}
+
+/* Produce a new VtsID that can be used, either by getting it from
+   the freelist, or, if that is empty, by expanding vts_tab. */
+static VtsID get_new_VtsID ( void )
+{
+   VtsID ii;
+   VtsTE te;
+   ii = get_from_free_list();
+   if (ii != VtsID_INVALID)
+      return ii;
+   te.vts = NULL;
+   te.rc = 0;
+   te.freelink = VtsID_INVALID;
+   ii = (VtsID)VG_(addToXA)( vts_tab, &te );
+   return ii;
+}
+
+
+/* Indirect callback from lib_zsm. */
+static void VtsID__rcinc ( VtsID ii )
+{
+   VtsTE* ie;
+   /* VG_(indexXA) does a range check for us */
+   ie = VG_(indexXA)( vts_tab, ii );
+   tl_assert(ie->vts); /* else it's not in use */
+   tl_assert(ie->rc < ~0UL); /* else we can't continue */
+   tl_assert(ie->vts->id == ii);
+   ie->rc++;
+}
+
+/* Indirect callback from lib_zsm. */
+static void VtsID__rcdec ( VtsID ii )
+{
+   VtsTE* ie;
+   /* VG_(indexXA) does a range check for us */
+   ie = VG_(indexXA)( vts_tab, ii );
+   tl_assert(ie->vts); /* else it's not in use */
+   tl_assert(ie->rc > 0); /* else RC snafu */
+   tl_assert(ie->vts->id == ii);
+   ie->rc--;
+}
+
+
+/* Look up 'cand' in our collection of VTSs.  If present, deallocate
+   it and return the VtsID for the pre-existing version.  If not
+   present, add it to both vts_tab and vts_set, allocate a fresh VtsID
+   for it, and return that. */
+static VtsID vts_tab__find_and_dealloc__or_add ( VTS* cand )
+{
+   VTS* auld;
+   tl_assert(cand->id == VtsID_INVALID);
+   auld = vts_set__find_and_dealloc__or_add(cand);
+   if (auld != cand) {
+      /* We already have an Aulde one.  Use that. */
+      VtsTE* ie;
+      tl_assert(auld->id != VtsID_INVALID);
+      ie = VG_(indexXA)( vts_tab, auld->id );
+      tl_assert(ie->vts == auld);
+      return auld->id;
+   } else {
+      VtsID  ii = get_new_VtsID();
+      VtsTE* ie = VG_(indexXA)( vts_tab, ii );
+      ie->vts = cand;
+      ie->rc = 0;
+      ie->freelink = VtsID_INVALID;
+      cand->id = ii;
+      return ii;
+   }
+}
+
+
+static void show_vts_stats ( HChar* caller )
+{
+   UWord nSet, nTab, nLive;
+   ULong totrc;
+   UWord n, i;
+   nSet = VG_(sizeFM)( vts_set );
+   nTab = VG_(sizeXA)( vts_tab );
+   totrc = 0;
+   nLive = 0;
+   n = VG_(sizeXA)( vts_tab );
+   for (i = 0; i < n; i++) {
+      VtsTE* ie = VG_(indexXA)( vts_tab, i );
+      if (ie->vts) {
+         nLive++;
+         totrc += (ULong)ie->rc;
+      } else {
+         tl_assert(ie->rc == 0);
+      }
+   }
+   VG_(printf)("  show_vts_stats %s\n", caller);
+   VG_(printf)("    vts_tab size %4lu\n", nTab);
+   VG_(printf)("    vts_tab live %4lu\n", nLive);
+   VG_(printf)("    vts_set size %4lu\n", nSet);
+   VG_(printf)("        total rc %4llu\n", totrc);
+}
+
+/* NOT TO BE CALLED FROM WITHIN libzsm. */
+static void vts_tab__do_GC ( Bool show_stats )
+{
+   UWord i, nTab, nLive, nFreed;
+
+   /* check this is actually necessary. */
+   tl_assert(vts_tab_freelist == VtsID_INVALID);
+
+   /* empty the caches for partial order checks and binary joins.  We
+      could do better and prune out the entries to be deleted, but it
+      ain't worth the hassle. */
+   VtsID__invalidate_caches();
+
+   /* First, make the reference counts up to date. */
+   zsm_flush_cache();
+
+   nTab = VG_(sizeXA)( vts_tab );
+
+   if (show_stats) {
+      VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
+      show_vts_stats("before GC");
+   }
+
+   /* Now we can inspect the entire vts_tab.  Any entries
+      with zero .rc fields are now no longer in use and can be put
+      back on the free list, removed from vts_set, and deleted. */
+   nFreed = 0;
+   for (i = 0; i < nTab; i++) {
+      Bool present;
+      UWord oldK = 0, oldV = 0;
+      VtsTE* te = VG_(indexXA)( vts_tab, i );
+      if (te->vts == NULL) {
+         tl_assert(te->rc == 0);
+         continue; /* already on the free list (presumably) */
+      }
+      if (te->rc > 0)
+         continue; /* in use */
+      /* Ok, we got one we can free. */
+      tl_assert(te->vts->id == i);
+      /* first, remove it from vts_set. */
+      present = VG_(delFromFM)( vts_set,
+                                &oldK, &oldV, (UWord)te->vts );
+      tl_assert(present); /* else it isn't in vts_set ?! */
+      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
+      tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
+      /* now free the VTS itself */
+      VTS__delete(te->vts);
+      te->vts = NULL;
+      /* and finally put this entry on the free list */
+      tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
+      add_to_free_list( i );
+      nFreed++;
+   }
+
+   /* Now figure out when the next GC should be.  We'll allow the
+      number of VTSs to double before GCing again.  Except of course
+      that since we can't (or, at least, don't) shrink vts_tab, we
+      can't set the threshold value smaller than it. */
+   tl_assert(nFreed <= nTab);
+   nLive = nTab - nFreed;
+   tl_assert(nLive >= 0 && nLive <= nTab);
+   vts_next_GC_at = 2 * nLive;
+   if (vts_next_GC_at < nTab)
+      vts_next_GC_at = nTab;
+
+   if (show_stats) {
+      show_vts_stats("after GC");
+      VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
+   }
+
+   if (1) {
+      static UInt ctr = 0;
+      tl_assert(nTab > 0);
+      VG_(printf)("libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
+                  ctr++, nTab, nLive, (100ULL * nLive) / nTab);
+   }
+}
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Vts IDs                                             //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+//////////////////////////
+static ULong stats__getOrdering_queries = 0;
+static ULong stats__getOrdering_misses  = 0;
+static ULong stats__join2_queries       = 0;
+static ULong stats__join2_misses        = 0;
+
+static inline UInt ROL32 ( UInt w, Int n ) {
+   w = (w << n) | (w >> (32-n));
+   return w;
+}
+static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
+   UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
+   return hash % nTab;
+}
+
+#define N_GETORDERING_CACHE 1023
+static
+   struct { VtsID vi1; VtsID vi2; POrd ord; }
+   getOrdering_cache[N_GETORDERING_CACHE];
+
+#define N_JOIN2_CACHE 1023
+static
+   struct { VtsID vi1; VtsID vi2; VtsID res; }
+   join2_cache[N_JOIN2_CACHE];
+
+static void VtsID__invalidate_caches ( void ) {
+   Int i;
+   for (i = 0; i < N_GETORDERING_CACHE; i++) {
+      getOrdering_cache[i].vi1 = VtsID_INVALID;
+      getOrdering_cache[i].vi2 = VtsID_INVALID;
+      getOrdering_cache[i].ord = 0; /* an invalid POrd value */
+   }
+   for (i = 0; i < N_JOIN2_CACHE; i++) {
+     join2_cache[i].vi1 = VtsID_INVALID;
+     join2_cache[i].vi2 = VtsID_INVALID;
+     join2_cache[i].res = VtsID_INVALID;
+   }
+}
+//////////////////////////
+
+static Bool VtsID__is_valid ( VtsID vi ) {
+   VtsTE* ve;
+   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
+      return False;
+   ve = VG_(indexXA)( vts_tab, vi );
+   if (!ve->vts)
+      return False;
+   tl_assert(ve->vts->id == vi);
+   return True;
+}
+
+static VTS* VtsID__to_VTS ( VtsID vi ) {
+   VtsTE* te = VG_(indexXA)( vts_tab, vi );
+   tl_assert(te->vts);
+   return te->vts;
+}
+
+static void VtsID__pp ( VtsID vi ) {
+   HChar buf[100];
+   VTS* vts = VtsID__to_VTS(vi);
+   VTS__show( buf, sizeof(buf)-1, vts );
+   buf[sizeof(buf)-1] = 0;
+   VG_(printf)("%s", buf);
+}
+
+/* compute partial ordering relation of vi1 and vi2. */
+__attribute__((noinline))
+static POrd VtsID__getOrdering_WRK ( VtsID vi1, VtsID vi2 ) {
+   UInt hash;
+   POrd ord;
+   VTS  *v1, *v2;
+   //if (vi1 == vi2) return POrd_EQ;
+   tl_assert(vi1 != vi2);
+   ////++
+   stats__getOrdering_queries++;
+   hash = hash_VtsIDs(vi1, vi2, N_GETORDERING_CACHE);
+   if (getOrdering_cache[hash].vi1 == vi1
+       && getOrdering_cache[hash].vi2 == vi2)
+      return getOrdering_cache[hash].ord;
+   stats__getOrdering_misses++;
+   ////--
+   v1  = VtsID__to_VTS(vi1);
+   v2  = VtsID__to_VTS(vi2);
+   ord = VTS__cmp( v1, v2 );
+   ////++
+   getOrdering_cache[hash].vi1 = vi1;
+   getOrdering_cache[hash].vi2 = vi2;
+   getOrdering_cache[hash].ord = ord;
+   ////--
+   return ord;
+}
+static inline POrd VtsID__getOrdering ( VtsID vi1, VtsID vi2 ) {
+   return vi1 == vi2  ? POrd_EQ  : VtsID__getOrdering_WRK(vi1, vi2);
+}
+
+/* compute binary join */
+__attribute__((noinline))
+static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
+   UInt  hash;
+   VtsID res;
+   VTS   *vts1, *vts2, *nyu;
+   //if (vi1 == vi2) return vi1;
+   tl_assert(vi1 != vi2);
+   ////++
+   stats__join2_queries++;
+   hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
+   if (join2_cache[hash].vi1 == vi1
+       && join2_cache[hash].vi2 == vi2)
+      return join2_cache[hash].res;
+   stats__join2_misses++;
+   ////--
+   vts1 = VtsID__to_VTS(vi1);
+   vts2 = VtsID__to_VTS(vi2);
+   nyu  = VTS__join(vts1,vts2);
+   res  = vts_tab__find_and_dealloc__or_add(nyu);
+   ////++
+   join2_cache[hash].vi1 = vi1;
+   join2_cache[hash].vi2 = vi2;
+   join2_cache[hash].res = res;
+   ////--
+   return res;
+}
+static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
+   return vi1 == vi2  ? vi1  : VtsID__join2_WRK(vi1, vi2);
+}
+
+/* create a singleton VTS, namely [thr:1] */
+static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
+   VTS* nyu = VTS__singleton(thr,tym);
+   return vts_tab__find_and_dealloc__or_add(nyu);
+}
+
+/* tick operation, creates value 1 if specified index is absent */
+static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
+   VTS* vts = VtsID__to_VTS(vi);
+   VTS* nyu = VTS__tick(idx,vts);
+   return vts_tab__find_and_dealloc__or_add(nyu);
+}
+
+/* index into a VTS (only for assertions) */
+static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
+   VTS* vts = VtsID__to_VTS(vi);
+   return VTS__indexAt_SLOW( vts, idx );
+}
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Threads                                             //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+struct _Thr {
+   /* Current VTSs for this thread.  They change as we go along.  viR
+      is the VTS to be used for reads, viW for writes.  Usually they
+      are the same, but can differ when we deal with reader-writer
+      locks.  It is always the case that VtsID__getOrdering(viW,viR)
+      == POrd_LT or POrd_EQ -- that is, viW must be the same as, or
+      lagging behind, viR. */
+   VtsID viR;
+   VtsID viW;
+   /* opaque (to us) data we hold on behalf of the library's user. */
+   void* opaque;
+};
+
+static Thr* Thr__new ( void ) {
+   Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
+   thr->viR = VtsID_INVALID;
+   thr->viW = VtsID_INVALID;
+   return thr;
+}
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Shadow Values                                       //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+// type SVal, SVal_INVALID and SVal_NOACCESS are defined by
+// hb_zsm.h.  We have to do everything else here.
+
+/* SVal is 64 bit unsigned int.
+
+      <---------30--------->    <---------30--------->
+   00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
+   01 X--------------------X XX X--------------------X   E(rror)
+   10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
+   11 X--------------------X XX X--------------------X   I: SVal_INVALID
+*/
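+
+/* For example: SVal__mkC(0x5, 0x9) = 0x0000000500000009ULL.  The top
+   two bits are 00 (both VtsIDs fit in 30 bits), so SVal__isC holds,
+   and SVal__unC_Rmin/SVal__unC_Wmin recover 0x5 and 0x9 respectively.
+   SVal__mkE() = 0x4000000000000000ULL (tag 01) and
+   SVal__mkA() = 0x8000000000000000ULL (tag 10). */
+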
+#define SVAL_TAGMASK (3ULL << 62)
+
+static inline Bool SVal__isC ( SVal s ) {
+   return (0ULL << 62) == (s & SVAL_TAGMASK);
+}
+static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
+   //tl_assert(VtsID__is_valid(rmini));
+   //tl_assert(VtsID__is_valid(wmini));
+   return (((ULong)rmini) << 32) | ((ULong)wmini);
+}
+static inline VtsID SVal__unC_Rmin ( SVal s ) {
+   tl_assert(SVal__isC(s));
+   return (VtsID)(s >> 32);
+}
+static inline VtsID SVal__unC_Wmin ( SVal s ) {
+   tl_assert(SVal__isC(s));
+   return (VtsID)(s & 0xFFFFFFFFULL);
+}
+
+static Bool SVal__isE ( SVal s ) {
+   return (1ULL << 62) == (s & SVAL_TAGMASK);
+}
+static SVal SVal__mkE ( void ) {
+   return 1ULL << 62;
+}
+
+static Bool SVal__isA ( SVal s ) {
+   return (2ULL << 62) == (s & SVAL_TAGMASK);
+}
+static SVal SVal__mkA ( void ) {
+   return 2ULL << 62;
+}
+
+/* Direct callback from lib_zsm. */
+static void SVal__rcinc ( SVal s ) {
+   if (SVal__isC(s)) {
+      VtsID__rcinc( SVal__unC_Rmin(s) );
+      VtsID__rcinc( SVal__unC_Wmin(s) );
+   }
+}
+
+/* Direct callback from lib_zsm. */
+static void SVal__rcdec ( SVal s ) {
+   if (SVal__isC(s)) {
+      VtsID__rcdec( SVal__unC_Rmin(s) );
+      VtsID__rcdec( SVal__unC_Wmin(s) );
+   }
+}
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Change-event map2                                   //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+#define EVENT_MAP_GC_AT                (1 * 1000 * 1000)
+#define EVENT_MAP_GC_DISCARD_FRACTION  0.5
+
+/* This is in two parts:
+
+   1. A set of RCECs.  This is a set of reference-counted stack
+      traces.  When the reference count of a stack trace falls to
+      zero, it becomes eligible for removal, and is actually freed up
+      at the next event-map GC.  The intent is to have a set of stack
+      traces which can be referred to from (2), but to only represent
+      each one once.  The set is implemented as a chained hash table
+      (contextTab), keyed on a hash of the stack trace, with an
+      arbitrary total ordering on the frames used to find exact
+      matches within a chain.
+
+   2. An OSet of OldRefs.  These store information about each old ref
+      that we need to record.  It is indexed by address of the
+      location for which the information is recorded.  For LRU
+      purposes, each OldRef also contains a generation number,
+      indicating when it was most recently accessed.
+
+      The important part of an OldRef is, however, its accs[] array.
+      This is an array of N_OLDREF_ACCS (Thr*, RCEC*) pairs.  It
+      allows us to collect the most recent access-traceback for up to
+      N_OLDREF_ACCS different threads at this location.  The accs[]
+      array is an MTF (move-to-front) array.  If a pair falls off the
+      end, that's too bad -- we will lose info about that thread's
+      access to this location.
+
+      When this OSet becomes too big, we can throw away the entries
+      whose generation numbers are below some threshold; hence doing
+      approximate LRU discarding.  For each discarded OldRef we must
+      of course decrement the reference count on all the RCECs it
+      refers to, in order that entries from (1) eventually get
+      discarded too.
+*/
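+
+/* Worked example of the accs[] move-to-front behaviour, assuming
+   N_OLDREF_ACCS == 3 and hypothetical threads T1..T4 with stack
+   traces E1..E4.  Suppose accs[] currently holds (T1,E1), (T2,E2),
+   (T3,E3), most recently used towards the front.
+   - A new access by T4 shifts everything down and drops the last
+     pair: accs[] becomes (T4,E4), (T1,E1), (T2,E2), and E3's
+     reference count is decremented.
+   - A repeat access by T2 with new stack E2' swaps T2 one slot
+     towards the front and replaces its stack: (T2,E2'), (T1,E1),
+     (T3,E3). */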
+
+
+static UWord stats__ctxt_rcdec1 = 0;
+static UWord stats__ctxt_rcdec2 = 0;
+static UWord stats__ctxt_rcdec3 = 0;
+static UWord stats__ctxt_rcdec_calls = 0;
+static UWord stats__ctxt_rcdec_discards = 0;
+static UWord stats__ctxt_rcdec1_eq = 0;
+
+static UWord stats__ctxt_tab_curr = 0;
+static UWord stats__ctxt_tab_max  = 0;
+
+static UWord stats__ctxt_tab_qs   = 0;
+static UWord stats__ctxt_tab_cmps = 0;
+
+
+///////////////////////////////////////////////////////
+//// Part (1): A hash table of RCECs
+///
+
+#define N_FRAMES 8
+
+// (UInt) `echo "Reference Counted Execution Context" | md5sum`
+#define RCEC_MAGIC 0xab88abb2UL
+
+//#define N_RCEC_TAB 98317 /* prime */
+#define N_RCEC_TAB 196613 /* prime */
+
+typedef
+   struct _RCEC {
+      struct _RCEC* next;
+      UWord magic;
+      UWord rc;
+      UWord rcX; /* used for crosschecking */
+      UWord frames[1 + N_FRAMES]; /* first word is hash of all the rest */
+   }
+   RCEC;
+
+static RCEC** contextTab = NULL; /* hash table of RCEC*s */
+
+
+/* Gives an arbitrary total order on RCEC .frames fields */
+static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
+   Word i;
+   tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
+   tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
+   if (ec1->frames[0] < ec2->frames[0]) return -1;
+   if (ec1->frames[0] > ec2->frames[0]) return 1;
+   for (i = 1; i < 1 + N_FRAMES; i++) {
+      if (ec1->frames[i] < ec2->frames[i]) return -1;
+      if (ec1->frames[i] > ec2->frames[i]) return 1;
+   }
+   return 0;
+}
+
+
+/* Dec the ref of this RCEC. */
+static void ctxt__rcdec ( RCEC* ec )
+{
+   stats__ctxt_rcdec_calls++;
+   tl_assert(ec && ec->magic == RCEC_MAGIC);
+   tl_assert(ec->rc > 0);
+   ec->rc--;
+}
+
+static void ctxt__rcinc ( RCEC* ec )
+{
+   tl_assert(ec && ec->magic == RCEC_MAGIC);
+   ec->rc++;
+}
+
+
+/* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
+   move it one step closer to the front of the list, so as to make
+   subsequent searches for it cheaper. */
+static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
+{
+   RCEC *ec0, *ec1, *ec2;
+   if (ec == *headp)
+      tl_assert(0); /* already at head of list */
+   tl_assert(ec != NULL);
+   ec0 = *headp;
+   ec1 = NULL;
+   ec2 = NULL;
+   while (True) {
+      if (ec0 == NULL || ec0 == ec) break;
+      ec2 = ec1;
+      ec1 = ec0;
+      ec0 = ec0->next;
+   }
+   tl_assert(ec0 == ec);
+   if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
+      RCEC* tmp;
+      /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
+         predecessor.  Swap ec0 and ec1, that is, move ec0 one step
+         closer to the start of the list. */
+      tl_assert(ec2->next == ec1);
+      tl_assert(ec1->next == ec0);
+      tmp = ec0->next;
+      ec2->next = ec0;
+      ec0->next = ec1;
+      ec1->next = tmp;
+   }
+   else
+   if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
+      /* it's second in the list. */
+      tl_assert(*headp == ec1);
+      tl_assert(ec1->next == ec0);
+      ec1->next = ec0->next;
+      ec0->next = ec1;
+      *headp = ec0;
+   }
+}
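+
+/* For example, if the list at *headp is A -> B -> C -> D:
+   - moving C one step forward gives A -> C -> B -> D (general case);
+   - moving B (the second element) gives B -> A -> C -> D, and *headp
+     is updated to point at B. */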
+
+
+/* Find the given RCEC in the tree, and return a pointer to it.  Or,
+   if not present, add the given one to the tree (by making a copy of
+   it, so the caller can immediately deallocate the original) and
+   return a pointer to the copy.  The caller can safely have 'example'
+   on its stack, since we will always return a pointer to a copy of
+   it, not to the original.  Note that the inserted node will have .rc
+   of zero and so the caller must immediately increment it. */
+__attribute__((noinline))
+static RCEC* ctxt__find_or_add ( RCEC* example )
+{
+   UWord hent;
+   RCEC* copy;
+   tl_assert(example && example->magic == RCEC_MAGIC);
+   tl_assert(example->rc == 0);
+
+   /* Search the hash table to see if we already have it. */
+   stats__ctxt_tab_qs++;
+   hent = example->frames[0] % N_RCEC_TAB;
+   copy = contextTab[hent];
+   while (1) {
+      if (!copy) break;
+      tl_assert(copy->magic == RCEC_MAGIC);
+      stats__ctxt_tab_cmps++;
+      if (0 == RCEC__cmp_by_frames(copy, example)) break;
+      copy = copy->next;
+   }
+
+   if (copy) {
+      tl_assert(copy != example);
+      /* optimisation: if it's not at the head of its list, move 1
+         step fwds, to make future searches cheaper */
+      if (copy != contextTab[hent]) {
+         move_RCEC_one_step_forward( &contextTab[hent], copy );
+      }
+   } else {
+      copy = HG_(zalloc)( "libhb.cfoa.1", sizeof(RCEC) );
+      tl_assert(copy != example);
+      *copy = *example;
+      copy->next = contextTab[hent];
+      contextTab[hent] = copy;
+      stats__ctxt_tab_curr++;
+      if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
+         stats__ctxt_tab_max = stats__ctxt_tab_curr;
+   }
+   return copy;
+}
+
+static inline UWord ROLW ( UWord w, Int n )
+{
+   Int bpw = 8 * sizeof(UWord);
+   w = (w << n) | (w >> (bpw-n));
+   return w;
+}
+
+__attribute__((noinline))
+static RCEC* get_RCEC ( Thr* thr )
+{
+   UWord hash, i;
+   RCEC  example;
+   example.magic = RCEC_MAGIC;
+   example.rc = 0;
+   example.rcX = 0;
+   main_get_stacktrace( thr, &example.frames[1], N_FRAMES );
+   hash = 0;
+   for (i = 1; i < 1 + N_FRAMES; i++) {
+      hash ^= example.frames[i];
+      hash = ROLW(hash, 19);
+   }
+   example.frames[0] = hash;
+   return ctxt__find_or_add( &example );
+}
+
+///////////////////////////////////////////////////////
+//// Part (2): An OSet of OldRefs, that refer to (1)
+///
+
+// (UInt) `echo "Old Reference Information" | md5sum`
+#define OldRef_MAGIC 0x30b1f075UL
+
+typedef  struct { Thr* thr; RCEC* rcec; }  Thr_n_RCEC;
+
+#define N_OLDREF_ACCS 3
+
+typedef
+   struct {
+      Addr  ea;
+      UWord magic;
+      UWord gen;    /* when most recently accessed */
+      /* unused slots in this array have .thr == NULL */
+      Thr_n_RCEC accs[N_OLDREF_ACCS];
+   }
+   OldRef;
+
+static Word OldRef__cmp_by_EA ( OldRef* r1, OldRef* r2 ) {
+   tl_assert(r1 && r1->magic == OldRef_MAGIC);
+   tl_assert(r2 && r2->magic == OldRef_MAGIC);
+   if (r1->ea < r2->ea) return -1;
+   if (r1->ea > r2->ea) return 1;
+   return 0;
+}
+
+static OSet* oldrefTree     = NULL; /* OSet* of OldRef */
+static UWord oldrefGen      = 0;    /* current LRU generation # */
+static UWord oldrefTreeN    = 0;    /* # elems in oldrefTree */
+static UWord oldrefGenIncAt = 0;    /* inc gen # when size hits this */
+
+static void event_map_bind ( Addr a, Thr* thr )
+{
+   OldRef key, *ref;
+   RCEC*  here;
+   Word   i, j;
+
+   key.ea    = a;
+   key.magic = OldRef_MAGIC;
+
+   ref = VG_(OSetGen_Lookup)( oldrefTree, &key );
+
+   if (ref) {
+
+      /* We already have a record for this address.  We now need to
+         see if we have a stack trace pertaining to this thread's
+         access. */
+      tl_assert(ref->magic == OldRef_MAGIC);
+
+      tl_assert(thr);
+      for (i = 0; i < N_OLDREF_ACCS; i++) {
+         if (ref->accs[i].thr == thr)
+            break;
+      }
+
+      if (i < N_OLDREF_ACCS) {
+         /* thread 'thr' has an entry at index 'i'.  Update it. */
+         if (i > 0) {
+            Thr_n_RCEC tmp = ref->accs[i-1];
+            ref->accs[i-1] = ref->accs[i];
+            ref->accs[i] = tmp;
+            i--;
+         }
+         here = get_RCEC( thr );
+         if (here == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
+         ctxt__rcinc( here );
+         stats__ctxt_rcdec1++;
+         ctxt__rcdec( ref->accs[i].rcec );
+         ref->accs[i].rcec = here;
+         tl_assert(ref->accs[i].thr == thr);
+      } else {
+         here = get_RCEC( thr );
+         ctxt__rcinc( here );
+         /* No entry for this thread.  Shuffle all of them down one
+            slot, and put the new entry at the start of the array. */
+         if (ref->accs[N_OLDREF_ACCS-1].thr) {
+            /* the last slot is in use.  We must dec the rc on the
+               associated rcec. */
+            tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
+            stats__ctxt_rcdec2++;
+            ctxt__rcdec(ref->accs[N_OLDREF_ACCS-1].rcec);
+         } else {
+            tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
+         }
+         for (j = N_OLDREF_ACCS-1; j >= 1; j--)
+            ref->accs[j] = ref->accs[j-1];
+         ref->accs[0].thr = thr;
+         ref->accs[0].rcec = here;
+         tl_assert(thr); /* thr==NULL is used to signify an empty slot,
+                            so we can't add a NULL thr. */
+      }
+
+      ref->gen = oldrefGen;
+      tl_assert(ref->ea == a);
+
+   } else {
+
+      /* We don't have a record for this address.  Create a new one. */
+      if (oldrefTreeN >= oldrefGenIncAt) {
+         oldrefGen++;
+         oldrefGenIncAt = oldrefTreeN + 50000;
+         if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
+                            oldrefGen, oldrefTreeN );
+      }
+      here = get_RCEC( thr );
+      ctxt__rcinc(here);
+      ref = VG_(OSetGen_AllocNode)( oldrefTree, sizeof(OldRef) );
+      ref->magic = OldRef_MAGIC;
+      ref->gen = oldrefGen;
+      ref->ea = a;
+      ref->accs[0].rcec = here;
+      ref->accs[0].thr = thr;
+      tl_assert(thr); /* thr==NULL is used to signify an empty slot,
+                         so we can't add a NULL thr. */
+      for (j = 1; j < N_OLDREF_ACCS; j++) {
+         ref->accs[j].thr = NULL;
+         ref->accs[j].rcec = NULL;
+      }
+      VG_(OSetGen_Insert)( oldrefTree, ref );
+      oldrefTreeN++;
+
+   }
+}
+
+
+static
+Bool event_map_lookup ( /*OUT*/struct _EC** resEC,
+                        /*OUT*/Thr** resThr,
+                        Thr* thr_acc, Addr a )
+{
+  Word   i;
+  OldRef key, *ref;
+
+  tl_assert(thr_acc);
+
+  key.ea = a;
+  key.magic = OldRef_MAGIC;
+
+   ref = VG_(OSetGen_Lookup)( oldrefTree, &key );
+   if (ref) {
+      tl_assert(ref->magic == OldRef_MAGIC);
+      tl_assert(ref->accs[0].thr); /* first slot must always be used */
+
+      for (i = 0; i < N_OLDREF_ACCS; i++) {
+         if (ref->accs[i].thr != NULL
+             && ref->accs[i].thr != thr_acc)
+            break;
+      }
+      /* If we didn't find an entry for some thread other than
+         thr_acc, just return the entry for thread 0.  It'll look
+         pretty stupid to the user though. */
+      if (i == N_OLDREF_ACCS)
+         i = 0;
+
+      tl_assert(i >= 0 && i < N_OLDREF_ACCS);
+      tl_assert(ref->accs[i].thr);
+      tl_assert(ref->accs[i].rcec);
+      tl_assert(ref->accs[i].rcec->magic == RCEC_MAGIC);
+
+      *resEC  = main_stacktrace_to_EC(&ref->accs[i].rcec->frames[1], N_FRAMES);
+      *resThr = ref->accs[i].thr;
+      return True;
+   } else {
+      return False;
+   }
+}
+
+static void event_map_init ( void )
+{
+   Word i;
+   tl_assert(!contextTab);
+   contextTab = HG_(zalloc)( "libhb.event_map_init.1 (context table)",
+                             N_RCEC_TAB * sizeof(RCEC*) );
+   tl_assert(contextTab);
+   for (i = 0; i < N_RCEC_TAB; i++)
+      contextTab[i] = NULL;
+
+   tl_assert(!oldrefTree);
+   tl_assert(offsetof(OldRef,ea) == 0); /* prereq for unboxed cmps */
+   oldrefTree = VG_(OSetGen_Create)(
+                   offsetof(OldRef,ea), /* == 0 */
+                   NULL, /* use unboxed cmp on OldRefs */
+                   HG_(zalloc), "libhb.event_map_init.2 (oldref tree)", 
+                   HG_(free)
+                );
+   tl_assert(oldrefTree);
+
+   oldrefGen = 0;
+   oldrefGenIncAt = 0;
+   oldrefTreeN = 0;
+}
+
+static void event_map__check_reference_counts ( Bool before )
+{
+   RCEC*   rcec;
+   OldRef* oldref;
+   Word    i;
+   UWord   nEnts = 0;
+
+   /* Set the 'check' reference counts to zero.  Also, optionally
+      check that the real reference counts are non-zero.  We allow
+      these to fall to zero before a GC, but the GC must get rid of
+      all those that are zero, hence none should be zero after a
+      GC. */
+   for (i = 0; i < N_RCEC_TAB; i++) {
+      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
+         nEnts++;
+         tl_assert(rcec);
+         tl_assert(rcec->magic == RCEC_MAGIC);
+         if (!before)
+            tl_assert(rcec->rc > 0);
+         rcec->rcX = 0;
+      }
+   }
+
+   /* check that the stats are sane */
+   tl_assert(nEnts == stats__ctxt_tab_curr);
+   tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
+
+   /* visit all the referencing points, inc check ref counts */
+   VG_(OSetGen_ResetIter)( oldrefTree );
+   while ( (oldref = VG_(OSetGen_Next)( oldrefTree )) ) {
+      tl_assert(oldref->magic == OldRef_MAGIC);
+      for (i = 0; i < N_OLDREF_ACCS; i++) {
+         if (oldref->accs[i].thr) {
+            tl_assert(oldref->accs[i].rcec);
+            tl_assert(oldref->accs[i].rcec->magic == RCEC_MAGIC);
+            oldref->accs[i].rcec->rcX++;
+         } else {
+            tl_assert(!oldref->accs[i].rcec);
+         }
+      }
+   }
+
+   /* compare check ref counts with actual */
+   for (i = 0; i < N_RCEC_TAB; i++) {
+      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
+         tl_assert(rcec->rc == rcec->rcX);
+      }
+   }
+}
+
+static void event_map_maybe_GC ( void )
+{
+   OldRef* oldref;
+   UWord   keyW, valW, retained, maxGen;
+   WordFM* genMap;
+   XArray* refs2del;
+   Word    i, j, n2del;
+
+   if (LIKELY(oldrefTreeN < EVENT_MAP_GC_AT))
+      return;
+
+   if (0)
+      VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);
+
+   /* Check our counting is sane */
+   tl_assert(oldrefTreeN == (UWord) VG_(OSetGen_Size)( oldrefTree ));
+
+   /* Check the reference counts */
+   event_map__check_reference_counts( True/*before*/ );
+
+   /* Compute the distribution of generation values in the ref tree */
+   /* genMap :: generation-number -> count-of-nodes-with-that-number */
+   genMap = VG_(newFM)( HG_(zalloc), "libhb.emmG.1",
+                                      HG_(free), NULL );
+
+   VG_(OSetGen_ResetIter)( oldrefTree );
+   while ( (oldref = VG_(OSetGen_Next)( oldrefTree )) ) {
+      UWord key = oldref->gen;
+      keyW = valW = 0;
+      if (VG_(lookupFM)(genMap, &keyW, &valW, key )) {
+         tl_assert(keyW == key);
+         tl_assert(valW > 0);
+      }
+      /* now valW is the old count for generation 'key' */
+      VG_(addToFM)(genMap, key, valW+1);
+   }
+
+   tl_assert(VG_(sizeFM)(genMap) > 0);
+
+   retained = oldrefTreeN;
+   maxGen = 0;
+   VG_(initIterFM)( genMap );
+   while (VG_(nextIterFM)( genMap, &keyW, &valW )) {
+      tl_assert(keyW > 0); /* can't allow a generation # 0 */
+      if (0) VG_(printf)("  XXX: gen %lu has %lu\n", keyW, valW );
+      tl_assert(keyW >= maxGen);
+      tl_assert(retained >= valW);
+      if (retained - valW
+          > (UWord)(EVENT_MAP_GC_AT * EVENT_MAP_GC_DISCARD_FRACTION)) {
+         retained -= valW;
+         maxGen = keyW;
+      } else {
+         break;
+      }
+   }
+   VG_(doneIterFM)( genMap );
+
+   VG_(printf)(
+      "libhb: EvM GC: delete generations %lu and below, "
+      "retaining %lu entries\n",
+      maxGen, retained );
+
+   VG_(deleteFM)( genMap, NULL, NULL );
+
+   /* If this fails, it means there's only one generation in the
+      entire tree.  So we're kind of in a bad situation, and need to
+      do some stop-gap measure, such as randomly deleting half the
+      entries. */
+   tl_assert(retained < oldrefTreeN);
+
+   /* Now make up a big list of the oldrefTree entries we want to
+      delete.  We can't simultaneously traverse the tree and delete
+      stuff from it, so first we need to copy them off somewhere
+      else. (sigh) */
+   refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.1",
+                          HG_(free), sizeof(OldRef*) );
+
+   VG_(OSetGen_ResetIter)( oldrefTree );
+   while ( (oldref = VG_(OSetGen_Next)( oldrefTree )) ) {
+      tl_assert(oldref->magic == OldRef_MAGIC);
+      if (oldref->gen <= maxGen) {
+         VG_(addToXA)( refs2del, &oldref );
+      }
+   }
+
+   n2del = VG_(sizeXA)( refs2del );
+   tl_assert(n2del == (Word)(oldrefTreeN - retained));
+
+   if (0) VG_(printf)("%s","deleting entries\n");
+   for (i = 0; i < n2del; i++) {
+      void* nd;
+      OldRef* ref = *(OldRef**)VG_(indexXA)( refs2del, i );
+      tl_assert(ref);
+      tl_assert(ref->magic == OldRef_MAGIC);
+      for (j = 0; j < N_OLDREF_ACCS; j++) {
+         if (ref->accs[j].rcec) {
+            tl_assert(ref->accs[j].thr);
+            stats__ctxt_rcdec3++;
+            ctxt__rcdec( ref->accs[j].rcec );
+         } else {
+            tl_assert(!ref->accs[j].thr);
+         }
+      }
+      nd = VG_(OSetGen_Remove)( oldrefTree, ref );
+      VG_(OSetGen_FreeNode)( oldrefTree, nd );
+   }
+
+   VG_(deleteXA)( refs2del );
+
+   tl_assert( VG_(OSetGen_Size)( oldrefTree ) == retained );
+
+   oldrefTreeN = retained;
+   oldrefGenIncAt = oldrefTreeN; /* start new gen right away */
+
+   /* Throw away all RCECs with zero reference counts */
+   for (i = 0; i < N_RCEC_TAB; i++) {
+      RCEC** pp = &contextTab[i];
+      RCEC*  p  = *pp;
+      while (p) {
+         if (p->rc == 0) {
+            *pp = p->next;
+            HG_(free)(p);
+            p = *pp;
+            tl_assert(stats__ctxt_tab_curr > 0);
+            stats__ctxt_tab_curr--;
+         } else {
+            pp = &p->next;
+            p = p->next;
+         }
+      }
+   }
+
+   /* Check the reference counts */
+   event_map__check_reference_counts( False/*after*/ );
+
+   //if (0)
+   //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
+   //            VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));
+
+}
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Core MSM                                            //
+//                                                     //
+/////////////////////////////////////////////////////////
+
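+/* Tunables for the state machine below, as used in msm_read and
+   msm_write:
+   - MSM_CONFACC:  when 1, accesses that change a location's
+     constraint are recorded in the event map (event_map_bind), so
+     that a later race report can show a conflicting access by
+     another thread.
+   - MSM_RACE2ERR: when 1, a racing access sends the location into
+     the error state E, suppressing further reports for it; when 0,
+     the constraint is instead joined with the accessing thread's
+     clocks and the location stays in the C state.
+   - MSM_CHECK:    when 1, run extra (expensive) sanity checks on the
+     shadow values on every access. */
+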
+#define MSM_CONFACC 1
+
+#define MSM_RACE2ERR 1
+
+#define MSM_CHECK 0
+
+static ULong stats__msm_read         = 0;
+static ULong stats__msm_read_change  = 0;
+static ULong stats__msm_write        = 0;
+static ULong stats__msm_write_change = 0;
+
+__attribute__((noinline))
+static void record_race_info ( Thr* acc_thr, 
+                               Addr acc_addr, SizeT szB, Bool isWrite,
+                               SVal svOld, SVal svNew )
+{
+   Bool found;
+   Thr* thrp = NULL;
+   struct _EC* where  = NULL;
+   struct _EC* wherep = NULL;
+   where = main_get_EC( acc_thr );
+   found = event_map_lookup( &wherep, &thrp, acc_thr, acc_addr );
+   if (found) {
+      tl_assert(wherep);
+      tl_assert(thrp);
+      tl_assert(thrp->opaque);
+      tl_assert(acc_thr->opaque);
+      HG_(record_error_Race)( acc_thr->opaque, acc_addr,
+                              isWrite, szB, NULL/*mb_lastlock*/,
+                              wherep, thrp->opaque );
+   } else {
+      tl_assert(!wherep);
+      tl_assert(!thrp);
+      tl_assert(acc_thr->opaque);
+      HG_(record_error_Race)( acc_thr->opaque, acc_addr,
+                              isWrite, szB, NULL/*mb_lastlock*/,
+                              NULL, NULL );
+   }
+}
+
+static Bool is_sane_SVal_C ( SVal sv ) {
+   POrd ord;
+   if (!SVal__isC(sv)) return True;
+   ord = VtsID__getOrdering( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
+   if (ord == POrd_EQ || ord == POrd_LT) return True;
+   return False;
+}
+
+
+/* Compute new state following a read */
+static inline SVal msm_read ( SVal svOld,
+                              /* The following are only needed for 
+                                 creating error reports. */
+                              Thr* acc_thr,
+                              Addr acc_addr, SizeT szB )
+{
+   SVal svNew = SVal_INVALID;
+   stats__msm_read++;
+
+   /* Redundant sanity check on the constraints */
+   if (MSM_CHECK) {
+      tl_assert(is_sane_SVal_C(svOld));
+   }
+
+   if (SVal__isC(svOld)) {
+      POrd  ord;
+      VtsID tviR  = acc_thr->viR;
+      VtsID tviW  = acc_thr->viW;
+      VtsID rmini = SVal__unC_Rmin(svOld);
+      VtsID wmini = SVal__unC_Wmin(svOld);
+
+      ord = VtsID__getOrdering(rmini,tviR);
+      if (ord == POrd_EQ || ord == POrd_LT) {
+         /* no race */
+         /* Note: RWLOCK subtlety: use tviW, not tviR */
+         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
+         goto out;
+      } else {
+         svNew = MSM_RACE2ERR
+                    ? SVal__mkE()
+                    : SVal__mkC( rmini, VtsID__join2(wmini,tviR) );
+         record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
+                           svOld, svNew );
+         goto out;
+      }
+   }
+   if (SVal__isA(svOld)) {
+      /* reading no-access memory (sigh); leave unchanged */
+      /* check for no pollution */
+      tl_assert(svOld == SVal_NOACCESS);
+      svNew = SVal_NOACCESS;
+      goto out;
+   }
+   if (SVal__isE(svOld)) {
+      /* no race, location is already "in error" */
+      svNew = SVal__mkE();
+      goto out;
+   }
+   VG_(printf)("msm_read: bad svOld: 0x%016llx\n", svOld);
+   tl_assert(0);
+
+  out:
+   if (MSM_CHECK) {
+      tl_assert(is_sane_SVal_C(svNew));
+   }
+   tl_assert(svNew != SVal_INVALID);
+   if (svNew != svOld) {
+      if (MSM_CONFACC && SVal__isC(svOld) && SVal__isC(svNew)) {
+         event_map_bind( acc_addr, acc_thr );
+         stats__msm_read_change++;
+      }
+   }
+   return svNew;
+}
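+
+/* Worked example (hypothetical threads T1 and T2).  Suppose T1 writes
+   a location at clock [ {T1,1} ], leaving it in state
+   C( Rmin=[{T1,1}], Wmin=[{T1,1}] ).
+   - If T2 then reads with viR = viW = [ {T2,1} ] and no intervening
+     synchronisation, Rmin vs viR is POrd_UN, so a race is reported.
+   - If T2 has instead synchronised with T1, so that viR = viW =
+     [ {T1,1}, {T2,1} ], then Rmin <= viR (POrd_LT): no race, and the
+     new state is C( [{T1,1}], [{T1,1},{T2,1}] ). */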
+
+
+/* Compute new state following a write */
+static inline SVal msm_write ( SVal svOld,
+                              /* The following are only needed for 
+                                 creating error reports. */
+                              Thr* acc_thr,
+                              Addr acc_addr, SizeT szB )
+{
+   SVal svNew = SVal_INVALID;
+   stats__msm_write++;
+
+   /* Redundant sanity check on the constraints */
+   if (MSM_CHECK) {
+      tl_assert(is_sane_SVal_C(svOld));
+   }
+
+   if (SVal__isC(svOld)) {
+      POrd  ord;
+      VtsID tviW  = acc_thr->viW;
+      VtsID wmini = SVal__unC_Wmin(svOld);
+
+      ord = VtsID__getOrdering(wmini,tviW);
+      if (ord == POrd_EQ || ord == POrd_LT) {
+         /* no race */
+         svNew = SVal__mkC( tviW, tviW );
+         goto out;
+      } else {
+         VtsID rmini = SVal__unC_Rmin(svOld);
+         svNew = MSM_RACE2ERR
+                    ? SVal__mkE()
+                    : SVal__mkC( VtsID__join2(rmini,tviW),
+                                 VtsID__join2(wmini,tviW) );
+         record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
+                           svOld, svNew );
+         goto out;
+      }
+   }
+   if (SVal__isA(svOld)) {
+      /* writing no-access memory (sigh); leave unchanged */
+      /* check for no pollution */
+      tl_assert(svOld == SVal_NOACCESS);
+      svNew = SVal_NOACCESS;
+      goto out;
+   }
+   if (SVal__isE(svOld)) {
+      /* no race, location is already "in error" */
+      svNew = SVal__mkE();
+      goto out;
+   }
+   VG_(printf)("msm_write: bad svOld: 0x%016llx\n", svOld);
+   tl_assert(0);
+
+  out:
+   if (MSM_CHECK) {
+      tl_assert(is_sane_SVal_C(svNew));
+   }
+   tl_assert(svNew != SVal_INVALID);
+   if (svNew != svOld) {
+      if (MSM_CONFACC && SVal__isC(svOld) && SVal__isC(svNew)) {
+         event_map_bind( acc_addr, acc_thr );
+         stats__msm_write_change++;
+      }
+   }
+   return svNew;
+}
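+
+/* Continuing the example above msm_write: if the synchronised T2
+   (viW = [ {T1,1}, {T2,1} ]) now writes, Wmin=[{T1,1}] <= viW, so
+   there is no race and the constraint collapses to
+   C( [{T1,1},{T2,1}], [{T1,1},{T2,1}] ), i.e. C(viW, viW). */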
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Apply core MSM to specific memory locations         //
+//                                                     //
+/////////////////////////////////////////////////////////
+
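+/* The zsm_applyN functions below locate the shadow value for an
+   N-bit access: find the address's cache line and 8-byte tree, pull
+   coarser-grained shadow values down to the access width if needed
+   (pulldown_to_8/16/32), apply msm_read or msm_write to the single
+   SVal, and write the result back.  Misaligned accesses, or cases
+   where the valid value lives at a finer granularity, fall through
+   to the 'slowcase' and are handled as two narrower accesses. */
+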
+/*------------- ZSM accesses: 8 bit apply ------------- */
+
+void zsm_apply8___msm_read ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_read8s++;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 .. 7 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
+      SVal* tree = &cl->svals[tno << 3];
+      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_read( svOld, thr,a,1 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+}
+
+void zsm_apply8___msm_write ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_read8s++;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 .. 7 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
+      SVal* tree = &cl->svals[tno << 3];
+      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_write( svOld, thr,a,1 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+}
+
+/*------------- ZSM accesses: 16 bit apply ------------- */
+
+void zsm_apply16___msm_read ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_read16s++;
+   if (UNLIKELY(!aligned16(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
+      if (valid_value_is_below_me_16(descr, toff)) {
+         goto slowcase;
+      } else {
+         SVal* tree = &cl->svals[tno << 3];
+         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
+      }
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_read( svOld, thr,a,2 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+   return;
+  slowcase: /* misaligned, or must go further down the tree */
+   stats__cline_16to8splits++;
+   zsm_apply8___msm_read( thr, a + 0 );
+   zsm_apply8___msm_read( thr, a + 1 );
+}
+
+void zsm_apply16___msm_write ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_write16s++;
+   if (UNLIKELY(!aligned16(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
+      if (valid_value_is_below_me_16(descr, toff)) {
+         goto slowcase;
+      } else {
+         SVal* tree = &cl->svals[tno << 3];
+         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
+      }
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_write( svOld, thr,a,2 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+   return;
+  slowcase: /* misaligned, or must go further down the tree */
+   stats__cline_16to8splits++;
+   zsm_apply8___msm_write( thr, a + 0 );
+   zsm_apply8___msm_write( thr, a + 1 );
+}
+
+/*------------- ZSM accesses: 32 bit apply ------------- */
+
+void zsm_apply32___msm_read ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_read32s++;
+   if (UNLIKELY(!aligned32(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 or 4 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
+      if (valid_value_is_above_me_32(descr, toff)) {
+         SVal* tree = &cl->svals[tno << 3];
+         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
+      } else {
+         goto slowcase;
+      }
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_read( svOld, thr,a,4 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+   return;
+  slowcase: /* misaligned, or must go further down the tree */
+   stats__cline_32to16splits++;
+   zsm_apply16___msm_read( thr, a + 0 );
+   zsm_apply16___msm_read( thr, a + 2 );
+}
+
+void zsm_apply32___msm_write ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_write32s++;
+   if (UNLIKELY(!aligned32(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 or 4 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
+      if (valid_value_is_above_me_32(descr, toff)) {
+         SVal* tree = &cl->svals[tno << 3];
+         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
+      } else {
+         goto slowcase;
+      }
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_write( svOld, thr,a,4 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+   return;
+  slowcase: /* misaligned, or must go further down the tree */
+   stats__cline_32to16splits++;
+   zsm_apply16___msm_write( thr, a + 0 );
+   zsm_apply16___msm_write( thr, a + 2 );
+}
+
+/*------------- ZSM accesses: 64 bit apply ------------- */
+
+void zsm_apply64___msm_read ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_read64s++;
+   if (UNLIKELY(!aligned64(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0, unused */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
+      goto slowcase;
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_read( svOld, thr,a,8 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+   return;
+  slowcase: /* misaligned, or must go further down the tree */
+   stats__cline_64to32splits++;
+   zsm_apply32___msm_read( thr, a + 0 );
+   zsm_apply32___msm_read( thr, a + 4 );
+}
+
+void zsm_apply64___msm_write ( Thr* thr, Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   SVal       svOld, svNew;
+   UShort     descr;
+   stats__cline_write64s++;
+   if (UNLIKELY(!aligned64(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0, unused */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
+      goto slowcase;
+   }
+   svOld = cl->svals[cloff];
+   svNew = msm_write( svOld, thr,a,8 );
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+   return;
+  slowcase: /* misaligned, or must go further down the tree */
+   stats__cline_64to32splits++;
+   zsm_apply32___msm_write( thr, a + 0 );
+   zsm_apply32___msm_write( thr, a + 4 );
+}
+
+/*--------------- ZSM accesses: 8 bit write --------------- */
+
+static
+void zsm_write8 ( Addr a, SVal svNew ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   UShort     descr;
+   stats__cline_set8s++;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 .. 7 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
+      SVal* tree = &cl->svals[tno << 3];
+      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
+      if (SCE_CACHELINE)
+         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+   }
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff] = svNew;
+}
+
+/*--------------- ZSM accesses: 16 bit write --------------- */
+
+static
+void zsm_write16 ( Addr a, SVal svNew ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   UShort     descr;
+   stats__cline_set16s++;
+   if (UNLIKELY(!aligned16(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
+      if (valid_value_is_below_me_16(descr, toff)) {
+         /* Writing at this level.  Need to fix up 'descr'. */
+         cl->descrs[tno] = pullup_descr_to_16(descr, toff);
+         /* At this point, the tree does not match cl->descrs[tno] any
+            more.  The assignments below will fix it up. */
+      } else {
+         /* We can't indiscriminately write on the w16 node as in the
+            w64 case, as that might make the node inconsistent with
+            its parent.  So first, pull down to this level. */
+         SVal* tree = &cl->svals[tno << 3];
+         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
+         if (SCE_CACHELINE)
+            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+      }
+   }
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff + 0] = svNew;
+   cl->svals[cloff + 1] = SVal_INVALID;
+   return;
+  slowcase: /* misaligned */
+   stats__cline_16to8splits++;
+   zsm_write8( a + 0, svNew );
+   zsm_write8( a + 1, svNew );
+}
+
+/*--------------- ZSM accesses: 32 bit write --------------- */
+
+static
+void zsm_write32 ( Addr a, SVal svNew ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   UShort     descr;
+   stats__cline_set32s++;
+   if (UNLIKELY(!aligned32(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 or 4 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
+      if (valid_value_is_above_me_32(descr, toff)) {
+         /* We can't indiscriminately write on the w32 node as in the
+            w64 case, as that might make the node inconsistent with
+            its parent.  So first, pull down to this level. */
+         SVal* tree = &cl->svals[tno << 3];
+         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
+         if (SCE_CACHELINE)
+            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
+      } else {
+         /* Writing at this level.  Need to fix up 'descr'. */
+         cl->descrs[tno] = pullup_descr_to_32(descr, toff);
+         /* At this point, the tree does not match cl->descrs[tno] any
+            more.  The assignments below will fix it up. */
+      }
+   }
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff + 0] = svNew;
+   cl->svals[cloff + 1] = SVal_INVALID;
+   cl->svals[cloff + 2] = SVal_INVALID;
+   cl->svals[cloff + 3] = SVal_INVALID;
+   return;
+  slowcase: /* misaligned */
+   stats__cline_32to16splits++;
+   zsm_write16( a + 0, svNew );
+   zsm_write16( a + 2, svNew );
+}
+
+/*--------------- ZSM accesses: 64 bit write --------------- */
+
+static
+void zsm_write64 ( Addr a, SVal svNew ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   stats__cline_set64s++;
+   if (UNLIKELY(!aligned64(a))) goto slowcase;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 */
+   cl->descrs[tno] = TREE_DESCR_64;
+   tl_assert(svNew != SVal_INVALID);
+   cl->svals[cloff + 0] = svNew;
+   cl->svals[cloff + 1] = SVal_INVALID;
+   cl->svals[cloff + 2] = SVal_INVALID;
+   cl->svals[cloff + 3] = SVal_INVALID;
+   cl->svals[cloff + 4] = SVal_INVALID;
+   cl->svals[cloff + 5] = SVal_INVALID;
+   cl->svals[cloff + 6] = SVal_INVALID;
+   cl->svals[cloff + 7] = SVal_INVALID;
+   return;
+  slowcase: /* misaligned */
+   stats__cline_64to32splits++;
+   zsm_write32( a + 0, svNew );
+   zsm_write32( a + 4, svNew );
+}
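+
+/* Illustrative sketch (not compiled) of the intended descriptor
+   transitions.  The addresses are arbitrary 8-aligned examples, the
+   function is hypothetical, and the exact intermediate descriptors
+   assume the pulldown helpers split only as far as needed. */
+#if 0
+static void example_descr_transitions ( void )
+{
+   zsm_write64( 0x5008, SVal__mkA() );  /* tree descriptor becomes TREE_DESCR_64 */
+   (void) zsm_read8( 0x500B );          /* pulldown_to_8 (see below) splits
+                                           64 -> 32 -> 16 -> 8 around byte 3 */
+   zsm_write16( 0x500A, SVal__mkA() );  /* bytes 2..3 are now two 8-bit leaves,
+                                           so pullup_descr_to_16 re-merges them */
+}
+#endif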
+
+/*------------- ZSM accesses: 8 bit read/copy ------------- */
+
+static
+SVal zsm_read8 ( Addr a ) {
+   CacheLine* cl; 
+   UWord      cloff, tno, toff;
+   UShort     descr;
+   stats__cline_get8s++;
+   cl    = get_cacheline(a);
+   cloff = get_cacheline_offset(a);
+   tno   = get_treeno(a);
+   toff  = get_tree_offset(a); /* == 0 .. 7 */
+   descr = cl->descrs[tno];
+   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
+      SVal* tree = &cl->svals[tno << 3];
+      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
+   }
+   return cl->svals[cloff];
+}
+
+static void zsm_copy8 ( Addr src, Addr dst, Bool uu_normalise ) {
+   SVal       sv;
+   stats__cline_copy8s++;
+   sv = zsm_read8( src );
+   zsm_write8( dst, sv );
+}
+
+/* ------------ Shadow memory range setting ops ------------ */
+
+void zsm_apply_range___msm_read ( Thr* thr, 
+                                  Addr a, SizeT len )
+{
+   /* fast track a couple of common cases */
+   if (len == 4 && aligned32(a)) {
+      zsm_apply32___msm_read( thr, a );
+      return;
+   }
+   if (len == 8 && aligned64(a)) {
+      zsm_apply64___msm_read( thr, a );
+      return;
+   }
+
+   /* be completely general (but as efficient as possible) */
+   if (len == 0) return;
+
+   if (!aligned16(a) && len >= 1) {
+      zsm_apply8___msm_read( thr, a );
+      a += 1;
+      len -= 1;
+      tl_assert(aligned16(a));
+   }
+   if (len == 0) return;
+
+   if (!aligned32(a) && len >= 2) {
+      zsm_apply16___msm_read( thr, a );
+      a += 2;
+      len -= 2;
+      tl_assert(aligned32(a));
+   }
+   if (len == 0) return;
+
+   if (!aligned64(a) && len >= 4) {
+      zsm_apply32___msm_read( thr, a );
+      a += 4;
+      len -= 4;
+      tl_assert(aligned64(a));
+   }
+   if (len == 0) return;
+
+   if (len >= 8) {
+      tl_assert(aligned64(a));
+      while (len >= 8) {
+         zsm_apply64___msm_read( thr, a );
+         a += 8;
+         len -= 8;
+      }
+      tl_assert(aligned64(a));
+   }
+   if (len == 0) return;
+
+   if (len >= 4)
+      tl_assert(aligned32(a));
+   if (len >= 4) {
+      zsm_apply32___msm_read( thr, a );
+      a += 4;
+      len -= 4;
+   }
+   if (len == 0) return;
+
+   if (len >= 2)
+      tl_assert(aligned16(a));
+   if (len >= 2) {
+      zsm_apply16___msm_read( thr, a );
+      a += 2;
+      len -= 2;
+   }
+   if (len == 0) return;
+
+   if (len >= 1) {
+      zsm_apply8___msm_read( thr, a );
+      a += 1;
+      len -= 1;
+   }
+   tl_assert(len == 0);
+}
+
+
+
+void zsm_apply_range___msm_write ( Thr* thr,
+                                   Addr a, SizeT len )
+{
+   /* fast track a couple of common cases */
+   if (len == 4 && aligned32(a)) {
+      zsm_apply32___msm_write( thr, a );
+      return;
+   }
+   if (len == 8 && aligned64(a)) {
+      zsm_apply64___msm_write( thr, a );
+      return;
+   }
+
+   /* be completely general (but as efficient as possible) */
+   if (len == 0) return;
+
+   if (!aligned16(a) && len >= 1) {
+      zsm_apply8___msm_write( thr, a );
+      a += 1;
+      len -= 1;
+      tl_assert(aligned16(a));
+   }
+   if (len == 0) return;
+
+   if (!aligned32(a) && len >= 2) {
+      zsm_apply16___msm_write( thr, a );
+      a += 2;
+      len -= 2;
+      tl_assert(aligned32(a));
+   }
+   if (len == 0) return;
+
+   if (!aligned64(a) && len >= 4) {
+      zsm_apply32___msm_write( thr, a );
+      a += 4;
+      len -= 4;
+      tl_assert(aligned64(a));
+   }
+   if (len == 0) return;
+
+   if (len >= 8) {
+      tl_assert(aligned64(a));
+      while (len >= 8) {
+         zsm_apply64___msm_write( thr, a );
+         a += 8;
+         len -= 8;
+      }
+      tl_assert(aligned64(a));
+   }
+   if (len == 0) return;
+
+   if (len >= 4)
+      tl_assert(aligned32(a));
+   if (len >= 4) {
+      zsm_apply32___msm_write( thr, a );
+      a += 4;
+      len -= 4;
+   }
+   if (len == 0) return;
+
+   if (len >= 2)
+      tl_assert(aligned16(a));
+   if (len >= 2) {
+      zsm_apply16___msm_write( thr, a );
+      a += 2;
+      len -= 2;
+   }
+   if (len == 0) return;
+
+   if (len >= 1) {
+      zsm_apply8___msm_write( thr, a );
+      a += 1;
+      len -= 1;
+   }
+   tl_assert(len == 0);
+}
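+
+/* Illustrative sketch (not compiled): how the range walkers above
+   decompose an arbitrary (a, len) pair into aligned pieces.  The
+   example function is hypothetical; the per-size calls listed in the
+   comment follow directly from the code above. */
+#if 0
+static void example_range_decomposition ( Thr* thr )
+{
+   zsm_apply_range___msm_read( thr, 0x1003, 13 );
+   /* performs, in order:
+        zsm_apply8___msm_read ( thr, 0x1003 );   1 byte
+        zsm_apply32___msm_read( thr, 0x1004 );   4 bytes
+        zsm_apply64___msm_read( thr, 0x1008 );   8 bytes
+      i.e. 13 bytes total, with no piece crossing an alignment
+      boundary of its own size. */
+}
+#endif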
+
+
+
+
+/* Block-copy states (needed for implementing realloc()). */
+
+static void zsm_copy_range ( Addr src, Addr dst, SizeT len )
+{
+   SizeT i;
+   if (len == 0)
+      return;
+
+   /* assert for non-overlappingness */
+   tl_assert(src+len <= dst || dst+len <= src);
+
+   /* To be simple, just copy byte by byte.  But so as not to wreck
+      performance for later accesses to dst[0 .. len-1], ask for
+      normalisation of each destination line as we finish with it, and
+      also of the first and last bytes of the range.  (Note that
+      zsm_copy8 currently ignores this hint -- see its uu_normalise
+      parameter.) */
+   for (i = 0; i < len; i++) {
+      Bool normalise
+         = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
+           || i == 0       /* first in range */
+           || i == len-1;  /* last in range */
+      zsm_copy8( src+i, dst+i, normalise );
+   }
+}
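+
+/* Illustrative sketch (not compiled): how a realloc-style handler
+   might use zsm_copy_range.  The function and parameter names are
+   hypothetical; the only hard requirement, enforced by the assertion
+   above, is that the source and destination ranges do not overlap. */
+#if 0
+static void example_realloc_shadow_copy ( Addr old_payload,
+                                           Addr new_payload,
+                                           SizeT old_szB )
+{
+   /* After the client has copied the payload itself, mirror the copy
+      in shadow memory so the new block inherits the old block's
+      access history. */
+   zsm_copy_range( old_payload, new_payload, old_szB );
+}
+#endif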
+
+
+/* For setting address ranges to a given value.  Has considerable
+   sophistication so as to avoid generating large numbers of pointless
+   cache loads/writebacks for large ranges. */
+
+/* Do small ranges in-cache, in the obvious way. */
+static
+void zsm_set_range_SMALL ( Addr a, SizeT len, SVal svNew )
+{
+   /* fast track a couple of common cases */
+   if (len == 4 && aligned32(a)) {
+      zsm_write32( a, svNew );
+      return;
+   }
+   if (len == 8 && aligned64(a)) {
+      zsm_write64( a, svNew );
+      return;
+   }
+
+   /* be completely general (but as efficient as possible) */
+   if (len == 0) return;
+
+   if (!aligned16(a) && len >= 1) {
+      zsm_write8( a, svNew );
+      a += 1;
+      len -= 1;
+      tl_assert(aligned16(a));
+   }
+   if (len == 0) return;
+
+   if (!aligned32(a) && len >= 2) {
+      zsm_write16( a, svNew );
+      a += 2;
+      len -= 2;
+      tl_assert(aligned32(a));
+   }
+   if (len == 0) return;
+
+   if (!aligned64(a) && len >= 4) {
+      zsm_write32( a, svNew );
+      a += 4;
+      len -= 4;
+      tl_assert(aligned64(a));
+   }
+   if (len == 0) return;
+
+   if (len >= 8) {
+      tl_assert(aligned64(a));
+      while (len >= 8) {
+         zsm_write64( a, svNew );
+         a += 8;
+         len -= 8;
+      }
+      tl_assert(aligned64(a));
+   }
+   if (len == 0) return;
+
+   if (len >= 4)
+      tl_assert(aligned32(a));
+   if (len >= 4) {
+      zsm_write32( a, svNew );
+      a += 4;
+      len -= 4;
+   }
+   if (len == 0) return;
+
+   if (len >= 2)
+      tl_assert(aligned16(a));
+   if (len >= 2) {
+      zsm_write16( a, svNew );
+      a += 2;
+      len -= 2;
+   }
+   if (len == 0) return;
+
+   if (len >= 1) {
+      zsm_write8( a, svNew );
+      a += 1;
+      len -= 1;
+   }
+   tl_assert(len == 0);
+}
+
+
+/* If we're doing a small range, hand off to zsm_set_range_SMALL.  But
+   for larger ranges, try to operate directly on the out-of-cache
+   representation, rather than dragging lines into the cache,
+   overwriting them, and forcing them out.  This turns out to be an
+   important performance optimisation. */
+
+static void zsm_set_range ( Addr a, SizeT len, SVal svNew )
+{
+   tl_assert(svNew != SVal_INVALID);
+   stats__cache_make_New_arange += (ULong)len;
+
+   if (0 && len > 500)
+      VG_(printf)("make New      ( %#lx, %ld )\n", a, len );
+
+   if (0) {
+      static UWord n_New_in_cache = 0;
+      static UWord n_New_not_in_cache = 0;
+      /* tag is 'a' with the in-line offset masked out, 
+         eg a[31]..a[4] 0000 */
+      Addr       tag = a & ~(N_LINE_ARANGE - 1);
+      UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
+      if (LIKELY(tag == cache_shmem.tags0[wix])) {
+         n_New_in_cache++;
+      } else {
+         n_New_not_in_cache++;
+      }
+      if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
+         VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
+                     n_New_in_cache, n_New_not_in_cache );
+   }
+
+   if (LIKELY(len < 2 * N_LINE_ARANGE)) {
+      zsm_set_range_SMALL( a, len, svNew );
+   } else {
+      Addr  before_start  = a;
+      Addr  aligned_start = cacheline_ROUNDUP(a);
+      Addr  after_start   = cacheline_ROUNDDN(a + len);
+      UWord before_len    = aligned_start - before_start;
+      UWord aligned_len   = after_start - aligned_start;
+      UWord after_len     = a + len - after_start;
+      tl_assert(before_start <= aligned_start);
+      tl_assert(aligned_start <= after_start);
+      tl_assert(before_len < N_LINE_ARANGE);
+      tl_assert(after_len < N_LINE_ARANGE);
+      tl_assert(get_cacheline_offset(aligned_start) == 0);
+      if (get_cacheline_offset(a) == 0) {
+         tl_assert(before_len == 0);
+         tl_assert(a == aligned_start);
+      }
+      if (get_cacheline_offset(a+len) == 0) {
+         tl_assert(after_len == 0);
+         tl_assert(after_start == a+len);
+      }
+      if (before_len > 0) {
+         zsm_set_range_SMALL( before_start, before_len, svNew );
+      }
+      if (after_len > 0) {
+         zsm_set_range_SMALL( after_start, after_len, svNew );
+      }
+      stats__cache_make_New_inZrep += (ULong)aligned_len;
+
+      while (1) {
+         Addr tag;
+         UWord wix;
+         if (aligned_start >= after_start)
+            break;
+         tl_assert(get_cacheline_offset(aligned_start) == 0);
+         tag = aligned_start & ~(N_LINE_ARANGE - 1);
+         wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
+         if (tag == cache_shmem.tags0[wix]) {
+            UWord i;
+            for (i = 0; i < N_LINE_ARANGE / 8; i++)
+               zsm_write64( aligned_start + i * 8, svNew );
+         } else {
+            UWord i;
+            Word zix;
+            SecMap* sm;
+            LineZ* lineZ;
+            /* This line is not in the cache.  Do not force it in; instead
+               modify it in-place. */
+            /* find the Z line to write in and rcdec it or the
+               associated F line. */
+            find_Z_for_writing( &sm, &zix, tag );
+            tl_assert(sm);
+            tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
+            lineZ = &sm->linesZ[zix];
+            lineZ->dict[0] = svNew;
+            lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
+            for (i = 0; i < N_LINE_ARANGE/4; i++)
+               lineZ->ix2s[i] = 0; /* all refer to dict[0] */
+            rcinc_LineZ(lineZ);
+         }
+         aligned_start += N_LINE_ARANGE;
+         aligned_len -= N_LINE_ARANGE;
+      }
+      tl_assert(aligned_start == after_start);
+      tl_assert(aligned_len == 0);
+   }
+}
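+
+/* Worked example for the large-range path above (assuming
+   N_LINE_ARANGE is 64): zsm_set_range(0x1030, 336, sv) splits into
+
+      before:   16 bytes at 0x1030..0x103F, via zsm_set_range_SMALL
+      aligned: 320 bytes at 0x1040..0x117F, i.e. 5 whole lines, each
+               done either in-cache (eight zsm_write64 calls) or
+               directly on the Z representation (dict[0] = svNew, all
+               ix2s zero)
+      after:     0 bytes
+
+   so at most the two partial lines at the ends are ever pulled
+   through the cache. */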
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Synchronisation objects                             //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+// (UInt) `echo "Synchronisation object" | md5sum`
+#define SO_MAGIC 0x56b3c5b0U
+
+struct _SO {
+   VtsID viR; /* r-clock of sender */
+   VtsID viW; /* w-clock of sender */
+   UInt  magic;
+};
+
+static SO* SO__Alloc ( void ) {
+   SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
+   so->viR   = VtsID_INVALID;
+   so->viW   = VtsID_INVALID;
+   so->magic = SO_MAGIC;
+   return so;
+}
+static void SO__Dealloc ( SO* so ) {
+   tl_assert(so);
+   tl_assert(so->magic == SO_MAGIC);
+   if (so->viR == VtsID_INVALID) {
+      tl_assert(so->viW == VtsID_INVALID);
+   } else {
+      tl_assert(so->viW != VtsID_INVALID);
+      VtsID__rcdec(so->viR);
+      VtsID__rcdec(so->viW);
+   }
+   so->magic = 0;
+   HG_(free)( so );
+}
+
+
+/////////////////////////////////////////////////////////
+//                                                     //
+// Top Level API                                       //
+//                                                     //
+/////////////////////////////////////////////////////////
+
+static void show_thread_state ( HChar* str, Thr* t ) 
+{
+   if (1) return;
+   if (t->viR == t->viW) {
+      VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
+      VtsID__pp( t->viR );
+      VG_(printf)("%s","\n");
+   } else {
+      VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
+      VtsID__pp( t->viR );
+      VG_(printf)(" viW %u==", t->viW);
+      VtsID__pp( t->viW );
+      VG_(printf)("%s","\n");
+   }
+}
+
+
+Thr* libhb_init (
+        void        (*get_stacktrace)( Thr*, Addr*, UWord ),
+        struct _EC* (*stacktrace_to_EC)( Addr*, UWord ),
+        struct _EC* (*get_EC)( Thr* )
+     )
+{
+   Thr*  thr;
+   VtsID vi;
+   tl_assert(get_stacktrace);
+   tl_assert(stacktrace_to_EC);
+   tl_assert(get_EC);
+   main_get_stacktrace   = get_stacktrace;
+   main_stacktrace_to_EC = stacktrace_to_EC;
+   main_get_EC           = get_EC;
+
+   // No need to initialise hg_wordfm.
+   // No need to initialise hg_wordset.
+
+   vts_set_init();
+   vts_tab_init();
+   event_map_init();
+   VtsID__invalidate_caches();
+
+   // initialise shadow memory
+   zsm_init( SVal__rcinc, SVal__rcdec );
+
+   thr = Thr__new();
+   vi  = VtsID__mk_Singleton( thr, 1 );
+   thr->viR = vi;
+   thr->viW = vi;
+   VtsID__rcinc(thr->viR);
+   VtsID__rcinc(thr->viW);
+
+   show_thread_state("  root", thr);
+   return thr;
+}
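+
+/* Illustrative sketch (not compiled) of the wiring a client has to
+   provide.  The client_* functions are hypothetical stand-ins: the
+   real ones would obtain stack traces and ExeContexts from the
+   Valgrind core on behalf of the given Thr. */
+#if 0
+static void client_get_stacktrace ( Thr* thr, Addr* frames, UWord nRequest )
+{
+   /* fill frames[0 .. nRequest-1] for the thread backing 'thr' */
+}
+static struct _EC* client_stacktrace_to_EC ( Addr* frames, UWord nFrames )
+{
+   return NULL; /* wrap frames[] up as an ExeContext */
+}
+static struct _EC* client_get_EC ( Thr* thr )
+{
+   return NULL; /* current ExeContext for the thread backing 'thr' */
+}
+
+static Thr* client_start_libhb ( void )
+{
+   /* The returned Thr* represents the root thread and already owns
+      its initial singleton vector clocks. */
+   return libhb_init( client_get_stacktrace,
+                      client_stacktrace_to_EC,
+                      client_get_EC );
+}
+#endif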
+
+Thr* libhb_create ( Thr* parent )
+{
+   /* The child's VTSs are copies of the parent's VTSs, but ticked at
+      the child's index.  Since the child's index is guaranteed
+      unique, it has never been seen before, so the implicit value
+      before the tick is zero and after that is one. */
+   Thr* child = Thr__new();
+
+   child->viR = VtsID__tick( parent->viR, child );
+   child->viW = VtsID__tick( parent->viW, child );
+   VtsID__rcinc(child->viR);
+   VtsID__rcinc(child->viW);
+
+   tl_assert(VtsID__indexAt( child->viR, child ) == 1);
+   tl_assert(VtsID__indexAt( child->viW, child ) == 1);
+
+   /* and the parent has to move along too */
+   VtsID__rcdec(parent->viR);
+   VtsID__rcdec(parent->viW);
+   parent->viR = VtsID__tick( parent->viR, parent );
+   parent->viW = VtsID__tick( parent->viW, parent );
+   VtsID__rcinc(parent->viR);
+   VtsID__rcinc(parent->viW);
+
+   show_thread_state(" child", child);
+   show_thread_state("parent", parent);
+
+   return child;
+}
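+
+/* Worked example for the ticking above: if the parent's clocks are
+   the singleton { parent:3 } when libhb_create is called, then
+   afterwards
+
+      child->viR  == child->viW  == { parent:3, child:1 }
+      parent->viR == parent->viW == { parent:4 }
+
+   so everything the parent did before the create happens-before the
+   child's first access, while the parent's subsequent accesses are
+   unordered w.r.t. the child's. */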
+
+/* Shut down the library, and print stats (in fact that's _all_
+   this is for). */
+void libhb_shutdown ( Bool show_stats )
+{
+   if (show_stats) {
+      VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
+      VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
+                  stats__secmaps_allocd,
+                  stats__secmap_ga_space_covered);
+      VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
+                  stats__secmap_linesZ_allocd,
+                  stats__secmap_linesZ_bytes);
+      VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)\n",
+                  stats__secmap_linesF_allocd,
+                  stats__secmap_linesF_bytes);
+      VG_(printf)(" secmaps: %'10lu iterator steppings\n",
+                  stats__secmap_iterator_steppings);
+      VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
+                  stats__secmaps_search, stats__secmaps_search_slow);
+
+      VG_(printf)("%s","\n");
+      VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
+                  stats__cache_totrefs, stats__cache_totmisses );
+      VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
+                  stats__cache_Z_fetches, stats__cache_F_fetches );
+      VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
+                  stats__cache_Z_wbacks, stats__cache_F_wbacks );
+      VG_(printf)("   cache: %'14lu invals,     %'14lu flushes\n",
+                  stats__cache_invals, stats__cache_flushes );
+      VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
+                  stats__cache_make_New_arange,
+                  stats__cache_make_New_inZrep);
+
+      VG_(printf)("%s","\n");
+      VG_(printf)("   cline: %'10lu normalises\n",
+                  stats__cline_normalises );
+      VG_(printf)("   cline:  rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
+                  stats__cline_read64s,
+                  stats__cline_read32s,
+                  stats__cline_read16s,
+                  stats__cline_read8s );
+      VG_(printf)("   cline:  wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
+                  stats__cline_write64s,
+                  stats__cline_write32s,
+                  stats__cline_write16s,
+                  stats__cline_write8s );
+      VG_(printf)("   cline: sets 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
+                  stats__cline_set64s,
+                  stats__cline_set32s,
+                  stats__cline_set16s,
+                  stats__cline_set8s );
+      VG_(printf)("   cline: get1s %'lu, copy1s %'lu\n",
+                  stats__cline_get8s, stats__cline_copy8s );
+      VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
+                 stats__cline_64to32splits,
+                 stats__cline_32to16splits,
+                 stats__cline_16to8splits );
+      VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
+                 stats__cline_64to32pulldown,
+                 stats__cline_32to16pulldown,
+                 stats__cline_16to8pulldown );
+      if (0)
+      VG_(printf)("   cline: sizeof(CacheLineZ) %ld, covers %ld bytes of arange\n",
+                  (Word)sizeof(LineZ), (Word)N_LINE_ARANGE);
+
+      VG_(printf)("%s","\n");
+
+      VG_(printf)("   libhb: %'13llu msm_read  (%'llu changed)\n",
+                  stats__msm_read, stats__msm_read_change);
+      VG_(printf)("   libhb: %'13llu msm_write (%'llu changed)\n",
+                  stats__msm_write, stats__msm_write_change);
+      VG_(printf)("   libhb: %'13llu getOrd queries (%'llu misses)\n",
+                  stats__getOrdering_queries, stats__getOrdering_misses);
+      VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
+                  stats__join2_queries, stats__join2_misses);
+
+      VG_(printf)("%s","\n");
+      VG_(printf)(
+         "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
+         VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
+      );
+      VG_(printf)( "   libhb: %lu entries in vts_set\n",
+                   VG_(sizeFM)( vts_set ) );
+
+      VG_(printf)("%s","\n");
+      VG_(printf)( "   libhb: ctxt__rcdec: 1=%lu(%lu eq), 2=%lu, 3=%lu\n",
+                   stats__ctxt_rcdec1, stats__ctxt_rcdec1_eq,
+                   stats__ctxt_rcdec2,
+                   stats__ctxt_rcdec3 );
+      VG_(printf)( "   libhb: ctxt__rcdec: calls %lu, discards %lu\n",
+                   stats__ctxt_rcdec_calls, stats__ctxt_rcdec_discards);
+      VG_(printf)( "   libhb: contextTab: %lu slots, %lu max ents\n",
+                   (UWord)N_RCEC_TAB,
+                   stats__ctxt_tab_curr );
+      VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
+                   stats__ctxt_tab_qs,
+                   stats__ctxt_tab_cmps );
+#if 0
+      VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
+      VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
+      VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
+      VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
+      VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
+      VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
+      VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
+      VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
+      VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
+      VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
+      VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
+      VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
+      VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
+      VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));
+
+      VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
+      VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
+      VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
+      VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
+#endif
+
+      VG_(printf)("%s","<<< END libhb stats >>>\n");
+      VG_(printf)("%s","\n");
+
+   }
+}
+
+void libhb_async_exit ( Thr* thr )
+{
+   /* is there anything we need to do? */
+}
+
+/* Both Segs and SOs point to VTSs.  However, there is no sharing, so
+   a Seg that points at a VTS is its one-and-only owner, and ditto for
+   a SO that points at a VTS. */
+
+SO* libhb_so_alloc ( void )
+{
+   return SO__Alloc();
+}
+
+void libhb_so_dealloc ( SO* so )
+{
+   tl_assert(so);
+   tl_assert(so->magic == SO_MAGIC);
+   SO__Dealloc(so);
+}
+
+/* See comments in libhb.h for details on the meaning of 
+   strong vs weak sends and strong vs weak receives. */
+void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
+{
+   /* Copy the VTSs from 'thr' into the sync object, and then move
+      the thread along one step. */
+
+   tl_assert(so);
+   tl_assert(so->magic == SO_MAGIC);
+
+   /* stay sane .. a thread's read-clock must always lead or be the
+      same as its write-clock */
+   { POrd ord = VtsID__getOrdering(thr->viW, thr->viR);
+     tl_assert(ord == POrd_EQ || ord == POrd_LT);
+   }
+
+   /* since we're overwriting the VtsIDs in the SO, we need to drop
+      any references made by the previous contents thereof */
+   if (so->viR == VtsID_INVALID) {
+      tl_assert(so->viW == VtsID_INVALID);
+      so->viR = thr->viR;
+      so->viW = thr->viW;
+      VtsID__rcinc(so->viR);
+      VtsID__rcinc(so->viW);
+   } else {
+      /* In a strong send, we dump any previous VC in the SO and
+         install the sending thread's VC instead.  For a weak send we
+         must join2 with what's already there. */
+      tl_assert(so->viW != VtsID_INVALID);
+      VtsID__rcdec(so->viR);
+      VtsID__rcdec(so->viW);
+      so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
+      so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
+      VtsID__rcinc(so->viR);
+      VtsID__rcinc(so->viW);
+   }
+
+   /* move both parent clocks along */
+   VtsID__rcdec(thr->viR);
+   VtsID__rcdec(thr->viW);
+   thr->viR = VtsID__tick( thr->viR, thr );
+   thr->viW = VtsID__tick( thr->viW, thr );
+   VtsID__rcinc(thr->viR);
+   VtsID__rcinc(thr->viW);
+   if (strong_send)
+      show_thread_state("s-send", thr);
+   else
+      show_thread_state("w-send", thr);
+}
+
+void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
+{
+   tl_assert(so);
+   tl_assert(so->magic == SO_MAGIC);
+
+   if (so->viR != VtsID_INVALID) {
+      tl_assert(so->viW != VtsID_INVALID);
+
+      /* Weak receive (basically, an R-acquisition of an R-W lock).
+         This advances the read-clock of the receiver, but not the
+         write-clock. */
+      VtsID__rcdec(thr->viR);
+      thr->viR = VtsID__join2( thr->viR, so->viR );
+      VtsID__rcinc(thr->viR);
+
+      /* For a strong receive, we also advance the receiver's write
+         clock, which means the receive as a whole is essentially
+         equivalent to a W-acquisition of an R-W lock. */
+      if (strong_recv) {
+         VtsID__rcdec(thr->viW);
+         thr->viW = VtsID__join2( thr->viW, so->viW );
+         VtsID__rcinc(thr->viW);
+      }
+
+      if (strong_recv) 
+         show_thread_state("s-recv", thr);
+      else 
+         show_thread_state("w-recv", thr);
+
+   } else {
+      tl_assert(so->viW == VtsID_INVALID);
+      /* Deal with degenerate case: 'so' has no vts, so there has been
+         no message posted to it.  Just ignore this case. */
+      show_thread_state("d-recv", thr);
+   }
+}
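+
+/* Illustrative sketch (not compiled): one way a client can model a
+   pthread-style mutex using an SO.  With one SO per mutex, a strong
+   send at unlock and a strong receive at lock make each critical
+   section on that mutex happen-before the next one.  The example_*
+   wrappers are hypothetical. */
+#if 0
+static void example_on_mutex_unlock ( Thr* thr, SO* so_for_mutex )
+{
+   libhb_so_send( thr, so_for_mutex, True/*strong_send*/ );
+}
+static void example_on_mutex_lock_acquired ( Thr* thr, SO* so_for_mutex )
+{
+   libhb_so_recv( thr, so_for_mutex, True/*strong_recv*/ );
+}
+#endif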
+
+Bool libhb_so_everSent ( SO* so )
+{
+   if (so->viR == VtsID_INVALID) {
+      tl_assert(so->viW == VtsID_INVALID);
+      return False;
+   } else {
+      tl_assert(so->viW != VtsID_INVALID);
+      return True;
+   }
+}
+
+#define XXX1 0 // 0x67a106c
+#define XXX2 0
+
+static Bool TRACEME(Addr a, SizeT szB) {
+   if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
+   if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
+   return False;
+}
+static void trace ( Thr* thr, Addr a, SizeT szB, HChar* s ) {
+   SVal sv = zsm_read8(a);
+   VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
+   show_thread_state("", thr);
+   VG_(printf)("%s","\n");
+}
+
+void libhb_range_new ( Thr* thr, Addr a, SizeT szB )
+{
+   SVal sv = SVal__mkC(thr->viW, thr->viW);
+   tl_assert(is_sane_SVal_C(sv));
+   if (TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
+   zsm_set_range( a, szB, sv );
+   if (TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
+}
+
+void libhb_range_noaccess ( Thr* thr, Addr a, SizeT szB )
+{
+   if (TRACEME(a,szB)) trace(thr,a,szB,"NA-before");
+   zsm_set_range( a, szB, SVal__mkA() );
+   if (TRACEME(a,szB)) trace(thr,a,szB,"NA-after ");
+}
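+
+/* Illustrative sketch (not compiled): how a client's allocator hooks
+   might use the two calls above.  The example_* wrappers are
+   hypothetical; the intent is that freshly allocated payload is
+   stamped with the allocating thread's write clock, and freed payload
+   is marked NoAccess. */
+#if 0
+static void example_on_malloc ( Thr* thr, Addr payload, SizeT szB )
+{
+   libhb_range_new( thr, payload, szB );
+}
+static void example_on_free ( Thr* thr, Addr payload, SizeT szB )
+{
+   libhb_range_noaccess( thr, payload, szB );
+}
+#endif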
+
+void* libhb_get_Thr_opaque ( Thr* thr ) {
+   tl_assert(thr);
+   return thr->opaque;
+}
+
+void libhb_set_Thr_opaque ( Thr* thr, void* v ) {
+   tl_assert(thr);
+   thr->opaque = v;
+}
+
+void libhb_copy_shadow_state ( Addr dst, Addr src, SizeT len )
+{
+   zsm_copy_range(dst, src, len);
+}
+
+void libhb_maybe_GC ( void )
+{
+   event_map_maybe_GC();
+   /* If there are still freelist entries available, no need for a
+      GC. */
+   if (vts_tab_freelist != VtsID_INVALID)
+      return;
+   /* So all the table entries are full, and we're having to expand
+      the table.  But did we hit the threshold point yet? */
+   if (VG_(sizeXA)( vts_tab ) < vts_next_GC_at)
+      return;
+   vts_tab__do_GC( False/*don't show stats*/ );
+}
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+//                                                             //
+// SECTION END main library                                    //
+//                                                             //
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+/*--------------------------------------------------------------------*/
+/*--- end                                             libhb_main.c ---*/
+/*--------------------------------------------------------------------*/