Merge branches/OTRACK_BY_INSTRUMENTATION into the trunk.  This adds
support to Memcheck for tracking the origin of uninitialised values,
if you use the --track-origins=yes flag.

This currently causes some Memcheck regression tests to fail, because
they now print an extra line of advisory text in their output.  This
will be fixed.

The core-tool interface is slightly changed, so the version number for
the interface needs to be incremented.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@7982 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/m_aspacemgr/aspacemgr-aix5.c b/coregrind/m_aspacemgr/aspacemgr-aix5.c
index 98ddc89..e0b7312 100644
--- a/coregrind/m_aspacemgr/aspacemgr-aix5.c
+++ b/coregrind/m_aspacemgr/aspacemgr-aix5.c
@@ -1263,11 +1263,11 @@
 /* Get the filename corresponding to this segment, if known and if it
    has one.  The returned name's storage cannot be assumed to be
    persistent, so the caller should immediately copy the name
-   elsewhere. */
-HChar* VG_(am_get_filename)( NSegment* seg )
+   elsewhere.  On AIX5, we don't know what this is (in general)
+   so just return NULL. */
+HChar* VG_(am_get_filename)( NSegment const* seg )
 {
-   ML_(am_barf)("unimplemented: VG_(am_get_filename)");
-   return NULL; /* placate gcc -Wall */
+   return NULL;
 }
 
 /* Collect up the start addresses of all non-free, non-resvn segments.
diff --git a/coregrind/m_debuginfo/debuginfo.c b/coregrind/m_debuginfo/debuginfo.c
index c2fb5b8..14ba2b0 100644
--- a/coregrind/m_debuginfo/debuginfo.c
+++ b/coregrind/m_debuginfo/debuginfo.c
@@ -1056,6 +1056,8 @@
    const NSegment *seg;
    HChar* filename;
    vg_assert(nbuf > 0);
+   /* Look in the debugInfo_list to find the name.  In most cases we
+      expect this to produce a result. */
    for (di = debugInfo_list; di != NULL; di = di->next) {
       if (di->text_present
           && di->text_avma <= a 
@@ -1076,9 +1078,13 @@
          return True;
       }
    }
-   if ((seg = VG_(am_find_nsegment(a))) != NULL &&
-       (filename = VG_(am_get_filename)(seg)) != NULL)
-   {
+   /* Last-ditch fallback position: if we don't find the address in
+      the debugInfo_list, ask the address space manager whether it
+      knows the name of the file associated with this mapping.  This
+      allows us to print the names of exe/dll files in the stack trace
+      when running programs under wine. */
+   if ( (seg = VG_(am_find_nsegment(a))) != NULL 
+        && (filename = VG_(am_get_filename)(seg)) != NULL ) {
       VG_(strncpy_safely)(buf, filename, nbuf);
       return True;
    }
diff --git a/coregrind/m_debuginfo/readdwarf3.c b/coregrind/m_debuginfo/readdwarf3.c
index 18bc2c3..263bd41 100644
--- a/coregrind/m_debuginfo/readdwarf3.c
+++ b/coregrind/m_debuginfo/readdwarf3.c
@@ -1164,7 +1164,7 @@
    }
    TempVar;
 
-#define N_D3_VAR_STACK 24
+#define N_D3_VAR_STACK 48
 
 typedef
    struct {
diff --git a/coregrind/m_debuginfo/readelf.c b/coregrind/m_debuginfo/readelf.c
index 24aeb5b..5d33dd8 100644
--- a/coregrind/m_debuginfo/readelf.c
+++ b/coregrind/m_debuginfo/readelf.c
@@ -1393,6 +1393,27 @@
                          di->bss_avma + di->bss_size - 1);
             TRACE_SYMTAB("acquiring .bss bias = %p\n", di->bss_bias);
          } else
+
+         /* Now one from the wtf?! department ... */
+         if (inrx && (!inrw) && size >= 0 && !di->bss_present) {
+            /* File contains a .bss, but it got mapped as rx only.
+               This is very strange.  For now, just pretend we didn't
+               see it :-) */
+            di->bss_present = False;
+            di->bss_svma = 0;
+            di->bss_avma = 0;
+            di->bss_size = 0;
+            di->bss_bias = 0;
+            bss_align = 0;
+            if (!VG_(clo_xml)) {
+               VG_(message)(Vg_UserMsg, "Warning: the following file's .bss is "
+                                       "mapped r-x only - ignoring .bss syms");
+               VG_(message)(Vg_UserMsg,   " %s", di->filename 
+                                                    ? di->filename
+                                                    : (UChar*)"(null?!)" );
+            }
+         } else
+
          if ((!inrw) && (!inrx) && size > 0 && !di->bss_present) {
             /* File contains a .bss, but it didn't get mapped.  Ignore. */
             di->bss_present = False;
diff --git a/coregrind/m_errormgr.c b/coregrind/m_errormgr.c
index 535252f..9a71a09 100644
--- a/coregrind/m_errormgr.c
+++ b/coregrind/m_errormgr.c
@@ -442,7 +442,8 @@
    }
 
    // Print stack trace elements
-   VG_(apply_StackTrace)(printSuppForIp, VG_(extract_StackTrace)(ec), stop_at);
+   VG_(apply_StackTrace)(printSuppForIp,
+                         VG_(get_ExeContext_StackTrace)(ec), stop_at);
 
    VG_(printf)("}\n");
 }
@@ -788,7 +789,7 @@
       pp_Error( p_min );
 
       if ((i+1 == VG_(clo_dump_error))) {
-         StackTrace ips = VG_(extract_StackTrace)(p_min->where);
+         StackTrace ips = VG_(get_ExeContext_StackTrace)(p_min->where);
          VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/,
                           ips[0], /*debugging*/True, 0xFE/*verbosity*/,
                           /*bbs_done*/0,
@@ -1148,7 +1149,7 @@
 {
    Int i;
    Char caller_name[ERRTXT_LEN];
-   StackTrace ips = VG_(extract_StackTrace)(err->where);
+   StackTrace ips = VG_(get_ExeContext_StackTrace)(err->where);
 
    for (i = 0; i < su->n_callers; i++) {
       Addr a = ips[i];
diff --git a/coregrind/m_execontext.c b/coregrind/m_execontext.c
index 51b3d82..b52cbfc 100644
--- a/coregrind/m_execontext.c
+++ b/coregrind/m_execontext.c
@@ -75,10 +75,16 @@
 
 struct _ExeContext {
    struct _ExeContext* chain;
-   UInt   n_ips;
+   /* A 32-bit unsigned integer that uniquely identifies this
+      ExeContext.  Memcheck uses these for origin tracking.  Values
+      must be nonzero (else Memcheck's origin tracking is hosed), must
+      be a multiple of four, and must be unique.  Hence they start at
+      4. */
+   UInt ecu;
    /* Variable-length array.  The size is 'n_ips'; at
       least 1, at most VG_DEEPEST_BACKTRACE.  [0] is the current IP,
       [1] is its caller, [2] is the caller of [1], etc. */
+   UInt n_ips;
    Addr ips[0];
 };
 
@@ -88,6 +94,9 @@
 static SizeT        ec_htab_size;     /* one of the values in ec_primes */
 static SizeT        ec_htab_size_idx; /* 0 .. N_EC_PRIMES-1 */
 
+/* ECU serial number */
+static UInt ec_next_ecu = 4; /* We must never issue zero */
+
 
 /* Stats only: the number of times the system was searched to locate a
    context. */
@@ -115,7 +124,7 @@
 {
    Int i;
    static Bool init_done = False;
-   if (init_done)
+   if (LIKELY(init_done))
       return;
    ec_searchreqs = 0;
    ec_searchcmps = 0;
@@ -280,29 +289,27 @@
    ec_htab_size_idx++;
 }
 
+/* Do the first part of getting a stack trace: actually unwind the
+   stack, and hand the results off to the duplicate-trace-finder
+   (_wrk2). */
+static ExeContext* record_ExeContext_wrk2 ( Addr* ips, UInt n_ips ); /*fwds*/
 static ExeContext* record_ExeContext_wrk ( ThreadId tid, Word first_ip_delta,
                                            Bool first_ip_only )
 {
-   Int         i;
-   Addr        ips[VG_DEEPEST_BACKTRACE];
-   Bool        same;
-   UWord       hash;
-   ExeContext* new_ec;
-   ExeContext* list;
-   UInt        n_ips;
-   ExeContext  *prev2, *prev;
+   Addr ips[VG_DEEPEST_BACKTRACE];
+   UInt n_ips;
 
-   static UInt ctr = 0;
+   init_ExeContext_storage();
 
    vg_assert(sizeof(void*) == sizeof(UWord));
    vg_assert(sizeof(void*) == sizeof(Addr));
 
-   init_ExeContext_storage();
+   vg_assert(VG_(is_valid_tid)(tid));
+
    vg_assert(VG_(clo_backtrace_size) >= 1 &&
              VG_(clo_backtrace_size) <= VG_DEEPEST_BACKTRACE);
 
    if (first_ip_only) {
-      vg_assert(VG_(is_valid_tid)(tid));
       n_ips = 1;
       ips[0] = VG_(get_IP)(tid);
    } else {
@@ -312,6 +319,24 @@
                                    first_ip_delta );
    }
 
+   return record_ExeContext_wrk2 ( &ips[0], n_ips );
+}
+
+/* Do the second part of getting a stack trace: ips[0 .. n_ips-1]
+   holds a proposed trace.  Find or allocate a suitable ExeContext.
+   Note that callers must have done init_ExeContext_storage() before
+   getting to this point. */
+static ExeContext* record_ExeContext_wrk2 ( Addr* ips, UInt n_ips )
+{
+   Int         i;
+   Bool        same;
+   UWord       hash;
+   ExeContext* new_ec;
+   ExeContext* list;
+   ExeContext  *prev2, *prev;
+
+   static UInt ctr = 0;
+
    tl_assert(n_ips >= 1 && n_ips <= VG_(clo_backtrace_size));
 
    /* Now figure out if we've seen this one before.  First hash it so
@@ -377,6 +402,15 @@
    for (i = 0; i < n_ips; i++)
       new_ec->ips[i] = ips[i];
 
+   vg_assert(VG_(is_plausible_ECU)(ec_next_ecu));
+   new_ec->ecu = ec_next_ecu;
+   ec_next_ecu += 4;
+   if (ec_next_ecu == 0) {
+      /* Urr.  Now we're hosed; we emitted 2^30 ExeContexts already
+         and have run out of numbers.  Not sure what to do. */
+      VG_(core_panic)("m_execontext: more than 2^30 ExeContexts created");
+   }
+
    new_ec->n_ips = n_ips;
    new_ec->chain = ec_htab[hash];
    ec_htab[hash] = new_ec;
@@ -401,12 +435,40 @@
                                       True/*first_ip_only*/ );
 }
 
+ExeContext* VG_(make_depth_1_ExeContext_from_Addr)( Addr a ) {
+   init_ExeContext_storage();
+   return record_ExeContext_wrk2( &a, 1 );
+}
 
-StackTrace VG_(extract_StackTrace) ( ExeContext* e )
-{                                  
+StackTrace VG_(get_ExeContext_StackTrace) ( ExeContext* e ) {
    return e->ips;
 }  
 
+UInt VG_(get_ECU_from_ExeContext)( ExeContext* e ) {
+   vg_assert(VG_(is_plausible_ECU)(e->ecu));
+   return e->ecu;
+}
+
+Int VG_(get_ExeContext_n_ips)( ExeContext* e ) {
+   vg_assert(e->n_ips >= 1);
+   return e->n_ips;
+}
+
+ExeContext* VG_(get_ExeContext_from_ECU)( UInt ecu )
+{
+   UWord i;
+   ExeContext* ec;
+   vg_assert(VG_(is_plausible_ECU)(ecu));
+   vg_assert(ec_htab_size > 0);
+   for (i = 0; i < ec_htab_size; i++) {
+      for (ec = ec_htab[i]; ec; ec = ec->chain) {
+         if (ec->ecu == ecu)
+            return ec;
+      }
+   }
+   return NULL;
+}
+
 /*--------------------------------------------------------------------*/
 /*--- end                                           m_execontext.c ---*/
 /*--------------------------------------------------------------------*/
diff --git a/coregrind/m_initimg/initimg-aix5.c b/coregrind/m_initimg/initimg-aix5.c
index f4f2c65..0a7b6b3 100644
--- a/coregrind/m_initimg/initimg-aix5.c
+++ b/coregrind/m_initimg/initimg-aix5.c
@@ -298,8 +298,9 @@
       sane way. */
    LibVEX_GuestPPC32_initialise(&arch->vex);
 
-   /* Zero out the shadow area. */
-   VG_(memset)(&arch->vex_shadow, 0, sizeof(VexGuestPPC32State));
+   /* Zero out the shadow areas. */
+   VG_(memset)(&arch->vex_shadow1, 0, sizeof(VexGuestPPC32State));
+   VG_(memset)(&arch->vex_shadow2, 0, sizeof(VexGuestPPC32State));
 
 #  else /* defined(VGP_ppc64_aix5) */
 
@@ -309,8 +310,9 @@
       sane way. */
    LibVEX_GuestPPC64_initialise(&arch->vex);
 
-   /* Zero out the shadow area. */
-   VG_(memset)(&arch->vex_shadow, 0, sizeof(VexGuestPPC64State));
+   /* Zero out the shadow areas. */
+   VG_(memset)(&arch->vex_shadow1, 0, sizeof(VexGuestPPC64State));
+   VG_(memset)(&arch->vex_shadow2, 0, sizeof(VexGuestPPC64State));
 
 #  endif
 
diff --git a/coregrind/m_initimg/initimg-linux.c b/coregrind/m_initimg/initimg-linux.c
index 725fd5c..bad4941 100644
--- a/coregrind/m_initimg/initimg-linux.c
+++ b/coregrind/m_initimg/initimg-linux.c
@@ -962,8 +962,9 @@
       sane way. */
    LibVEX_GuestX86_initialise(&arch->vex);
 
-   /* Zero out the shadow area. */
-   VG_(memset)(&arch->vex_shadow, 0, sizeof(VexGuestX86State));
+   /* Zero out the shadow areas. */
+   VG_(memset)(&arch->vex_shadow1, 0, sizeof(VexGuestX86State));
+   VG_(memset)(&arch->vex_shadow2, 0, sizeof(VexGuestX86State));
 
    /* Put essential stuff into the new state. */
    arch->vex.guest_ESP = iifii.initial_client_SP;
@@ -982,8 +983,9 @@
       sane way. */
    LibVEX_GuestAMD64_initialise(&arch->vex);
 
-   /* Zero out the shadow area. */
-   VG_(memset)(&arch->vex_shadow, 0, sizeof(VexGuestAMD64State));
+   /* Zero out the shadow areas. */
+   VG_(memset)(&arch->vex_shadow1, 0, sizeof(VexGuestAMD64State));
+   VG_(memset)(&arch->vex_shadow2, 0, sizeof(VexGuestAMD64State));
 
    /* Put essential stuff into the new state. */
    arch->vex.guest_RSP = iifii.initial_client_SP;
@@ -996,8 +998,9 @@
       sane way. */
    LibVEX_GuestPPC32_initialise(&arch->vex);
 
-   /* Zero out the shadow area. */
-   VG_(memset)(&arch->vex_shadow, 0, sizeof(VexGuestPPC32State));
+   /* Zero out the shadow areas. */
+   VG_(memset)(&arch->vex_shadow1, 0, sizeof(VexGuestPPC32State));
+   VG_(memset)(&arch->vex_shadow2, 0, sizeof(VexGuestPPC32State));
 
    /* Put essential stuff into the new state. */
    arch->vex.guest_GPR1 = iifii.initial_client_SP;
@@ -1010,8 +1013,9 @@
       sane way. */
    LibVEX_GuestPPC64_initialise(&arch->vex);
 
-   /* Zero out the shadow area. */
-   VG_(memset)(&arch->vex_shadow, 0, sizeof(VexGuestPPC64State));
+   /* Zero out the shadow areas. */
+   VG_(memset)(&arch->vex_shadow1, 0, sizeof(VexGuestPPC64State));
+   VG_(memset)(&arch->vex_shadow2, 0, sizeof(VexGuestPPC64State));
 
    /* Put essential stuff into the new state. */
    arch->vex.guest_GPR1 = iifii.initial_client_SP;
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index 3bf1246..8cc9989 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -79,34 +79,44 @@
 }
 
 
-void VG_(get_shadow_regs_area) ( ThreadId tid, OffT offset, SizeT size,
-                                 UChar* area )
+void
+VG_(get_shadow_regs_area) ( ThreadId tid, 
+                            /*DST*/UChar* dst,
+                            /*SRC*/Int shadowNo, OffT offset, SizeT size )
 {
+   void*        src;
    ThreadState* tst;
-
+   vg_assert(shadowNo == 1 || shadowNo == 2);
    vg_assert(VG_(is_valid_tid)(tid));
-   tst = & VG_(threads)[tid];
-
    // Bounds check
    vg_assert(0 <= offset && offset < sizeof(VexGuestArchState));
    vg_assert(offset + size <= sizeof(VexGuestArchState));
-
-   VG_(memcpy)( area, (void*)(((Addr)&(tst->arch.vex_shadow)) + offset), size);
+   // Copy
+   tst = & VG_(threads)[tid];
+   src = shadowNo == 1
+            ? (void*)(((Addr)&(tst->arch.vex_shadow1)) + offset)
+            : (void*)(((Addr)&(tst->arch.vex_shadow2)) + offset);
+   VG_(memcpy)( dst, src, size);
 }
 
-void VG_(set_shadow_regs_area) ( ThreadId tid, OffT offset, SizeT size,
-                                 const UChar* area )
+void
+VG_(set_shadow_regs_area) ( ThreadId tid, 
+                            /*DST*/Int shadowNo, OffT offset, SizeT size,
+                            /*SRC*/const UChar* src )
 {
+   void*        dst;
    ThreadState* tst;
-
+   vg_assert(shadowNo == 1 || shadowNo == 2);
    vg_assert(VG_(is_valid_tid)(tid));
-   tst = & VG_(threads)[tid];
-
    // Bounds check
    vg_assert(0 <= offset && offset < sizeof(VexGuestArchState));
    vg_assert(offset + size <= sizeof(VexGuestArchState));
-
-   VG_(memcpy)( (void*)(((Addr)(&tst->arch.vex_shadow)) + offset), area, size);
+   // Copy
+   tst = & VG_(threads)[tid];
+   dst = shadowNo == 1
+            ? (void*)(((Addr)&(tst->arch.vex_shadow1)) + offset)
+            : (void*)(((Addr)&(tst->arch.vex_shadow2)) + offset);
+   VG_(memcpy)( dst, src, size);
 }
 
 
diff --git a/coregrind/m_main.c b/coregrind/m_main.c
index 5914a2f..551349f 100644
--- a/coregrind/m_main.c
+++ b/coregrind/m_main.c
@@ -743,7 +743,7 @@
       VG_(message)(Vg_UserMsg, "");
       VG_(message)(Vg_UserMsg, "<valgrindoutput>");
       VG_(message)(Vg_UserMsg, "");
-      VG_(message)(Vg_UserMsg, "<protocolversion>2</protocolversion>");
+      VG_(message)(Vg_UserMsg, "<protocolversion>3</protocolversion>");
       VG_(message)(Vg_UserMsg, "");
    }
 
@@ -1556,16 +1556,8 @@
    //   p: setup_client_stack()      [for 'VG_(client_arg[cv]']
    //   p: setup_file_descriptors()  [for 'VG_(fd_xxx_limit)']
    //--------------------------------------------------------------
-   {
-      Char* s;
-      Bool  ok;
-      VG_(debugLog)(1, "main", "Initialise the tool part 1 (pre_clo_init)\n");
-      (VG_(tool_info).tl_pre_clo_init)();
-      ok = VG_(sanity_check_needs)( &s );
-      if (!ok) {
-         VG_(tool_panic)(s);
-      }
-   }
+   VG_(debugLog)(1, "main", "Initialise the tool part 1 (pre_clo_init)\n");
+   (VG_(tool_info).tl_pre_clo_init)();
 
    //--------------------------------------------------------------
    // If --tool and --help/--help-debug was given, now give the core+tool
@@ -1614,6 +1606,17 @@
    //--------------------------------------------------------------
    VG_(debugLog)(1, "main", "Initialise the tool part 2 (post_clo_init)\n");
    VG_TDICT_CALL(tool_post_clo_init);
+   {
+      /* The tool's "needs" will by now be finalised, since it has no
+         further opportunity to specify them.  So now sanity check
+         them. */
+      Char* s;
+      Bool  ok;
+      ok = VG_(sanity_check_needs)( &s );
+      if (!ok) {
+         VG_(tool_panic)(s);
+      }
+   }
 
    //--------------------------------------------------------------
    // Initialise translation table and translation cache
diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c
index 797104f..6cc9571 100644
--- a/coregrind/m_scheduler/scheduler.c
+++ b/coregrind/m_scheduler/scheduler.c
@@ -225,7 +225,9 @@
    vg_assert(VG_(running_tid) == VG_INVALID_THREADID);
    VG_(running_tid) = tid;
 
-   VG_(unknown_SP_update)(VG_(get_SP(tid)), VG_(get_SP(tid)));
+   { Addr gsp = VG_(get_SP)(tid);
+     VG_(unknown_SP_update)(gsp, gsp, 0/*unknown origin*/);
+   }
 
    if (VG_(clo_trace_sched)) {
       HChar buf[150];
@@ -529,40 +531,64 @@
 
 /* Do various guest state alignment checks prior to running a thread.
    Specifically, check that what we have matches Vex's guest state
-   layout requirements. */
-static void do_pre_run_checks ( volatile ThreadState* tst )
+   layout requirements.  See libvex.h for details, but in short the
+   requirements are: There must be no holes in between the primary
+   guest state, its two copies, and the spill area.  In short, all 4
+   areas must have a 16-aligned size and be 16-aligned, and placed
+   back-to-back. */
+static void do_pre_run_checks ( ThreadState* tst )
 {
-   Addr a_vex    = (Addr) & tst->arch.vex;
-   Addr a_vexsh  = (Addr) & tst->arch.vex_shadow;
-   Addr a_spill  = (Addr) & tst->arch.vex_spill;
-   UInt sz_vex   = (UInt) sizeof tst->arch.vex;
-   UInt sz_vexsh = (UInt) sizeof tst->arch.vex_shadow;
-   UInt sz_spill = (UInt) sizeof tst->arch.vex_spill;
+   Addr a_vex     = (Addr) & tst->arch.vex;
+   Addr a_vexsh1  = (Addr) & tst->arch.vex_shadow1;
+   Addr a_vexsh2  = (Addr) & tst->arch.vex_shadow2;
+   Addr a_spill   = (Addr) & tst->arch.vex_spill;
+   UInt sz_vex    = (UInt) sizeof tst->arch.vex;
+   UInt sz_vexsh1 = (UInt) sizeof tst->arch.vex_shadow1;
+   UInt sz_vexsh2 = (UInt) sizeof tst->arch.vex_shadow2;
+   UInt sz_spill  = (UInt) sizeof tst->arch.vex_spill;
 
    if (0)
-   VG_(printf)("%p %d %p %d %p %d\n",
-               (void*)a_vex, sz_vex, (void*)a_vexsh, sz_vexsh,
+   VG_(printf)("gst %p %d, sh1 %p %d, "
+               "sh2 %p %d, spill %p %d\n",
+               (void*)a_vex, sz_vex,
+               (void*)a_vexsh1, sz_vexsh1,
+               (void*)a_vexsh2, sz_vexsh2,
                (void*)a_spill, sz_spill );
 
-   vg_assert(VG_IS_8_ALIGNED(sz_vex));
-   vg_assert(VG_IS_8_ALIGNED(sz_vexsh));
+   vg_assert(VG_IS_16_ALIGNED(sz_vex));
+   vg_assert(VG_IS_16_ALIGNED(sz_vexsh1));
+   vg_assert(VG_IS_16_ALIGNED(sz_vexsh2));
    vg_assert(VG_IS_16_ALIGNED(sz_spill));
 
-   vg_assert(VG_IS_4_ALIGNED(a_vex));
-   vg_assert(VG_IS_4_ALIGNED(a_vexsh));
-   vg_assert(VG_IS_4_ALIGNED(a_spill));
+   vg_assert(VG_IS_16_ALIGNED(a_vex));
+   vg_assert(VG_IS_16_ALIGNED(a_vexsh1));
+   vg_assert(VG_IS_16_ALIGNED(a_vexsh2));
+   vg_assert(VG_IS_16_ALIGNED(a_spill));
 
-   vg_assert(sz_vex == sz_vexsh);
-   vg_assert(a_vex + sz_vex == a_vexsh);
-
+   /* Check that the guest state and its two shadows have the same
+      size, and that there are no holes in between.  The latter is
+      important because Memcheck assumes that it can reliably access
+      the shadows by indexing off a pointer to the start of the
+      primary guest state area. */
+   vg_assert(sz_vex == sz_vexsh1);
+   vg_assert(sz_vex == sz_vexsh2);
+   vg_assert(a_vex + 1 * sz_vex == a_vexsh1);
+   vg_assert(a_vex + 2 * sz_vex == a_vexsh2);
+   /* Also check there's no hole between the second shadow area and
+      the spill area. */
    vg_assert(sz_spill == LibVEX_N_SPILL_BYTES);
-   vg_assert(a_vex + 2 * sz_vex == a_spill);
+   vg_assert(a_vex + 3 * sz_vex == a_spill);
 
 #  if defined(VGA_ppc32) || defined(VGA_ppc64)
    /* ppc guest_state vector regs must be 16 byte aligned for
-      loads/stores */
+      loads/stores.  This is important! */
    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VR0));
-   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow.guest_VR0));
+   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VR0));
+   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VR0));
+   /* be extra paranoid .. */
+   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VR1));
+   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VR1));
+   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VR1));
 #  endif   
 }
 
@@ -583,7 +609,7 @@
    vg_assert(!VG_(is_exiting)(tid));
 
    tst = VG_(get_ThreadState)(tid);
-   do_pre_run_checks(tst);
+   do_pre_run_checks( (ThreadState*)tst );
    /* end Paranoia */
 
    trc = 0;
@@ -686,7 +712,7 @@
    vg_assert(!VG_(is_exiting)(tid));
 
    tst = VG_(get_ThreadState)(tid);
-   do_pre_run_checks(tst);
+   do_pre_run_checks( (ThreadState*)tst );
    /* end Paranoia */
 
 #  if defined(VGA_ppc32) || defined(VGA_ppc64)
diff --git a/coregrind/m_sigframe/sigframe-amd64-linux.c b/coregrind/m_sigframe/sigframe-amd64-linux.c
index 66a1db7..f042a8d 100644
--- a/coregrind/m_sigframe/sigframe-amd64-linux.c
+++ b/coregrind/m_sigframe/sigframe-amd64-linux.c
@@ -93,7 +93,8 @@
 
    /* XXX This is wrong.  Surely we should store the shadow values
       into the shadow memory behind the actual values? */
-   VexGuestAMD64State vex_shadow;
+   VexGuestAMD64State vex_shadow1;
+   VexGuestAMD64State vex_shadow2;
 
    /* HACK ALERT */
    VexGuestAMD64State vex;
@@ -374,7 +375,7 @@
 */
 static Bool extend ( ThreadState *tst, Addr addr, SizeT size )
 {
-   ThreadId tid = tst->tid;
+   ThreadId        tid = tst->tid;
    NSegment const* stackseg = NULL;
 
    if (VG_(extend_stack)(addr, tst->client_stack_szB)) {
@@ -406,7 +407,7 @@
    /* For tracking memory events, indicate the entire frame has been
       allocated. */
    VG_TRACK( new_mem_stack_signal, addr - VG_STACK_REDZONE_SZB,
-             size + VG_STACK_REDZONE_SZB );
+             size + VG_STACK_REDZONE_SZB, tid );
 
    return True;
 }
@@ -422,7 +423,8 @@
 {
    frame->sigNo_private = sigNo;
    frame->magicPI       = 0x31415927;
-   frame->vex_shadow    = tst->arch.vex_shadow;
+   frame->vex_shadow1   = tst->arch.vex_shadow1;
+   frame->vex_shadow2   = tst->arch.vex_shadow2;
    /* HACK ALERT */
    frame->vex           = tst->arch.vex;
    /* end HACK ALERT */
@@ -541,13 +543,14 @@
       *sigNo = VKI_SIGSEGV;
       return False;
    }
-   tst->sig_mask        = frame->mask;
-   tst->tmp_sig_mask    = frame->mask;
-   tst->arch.vex_shadow = frame->vex_shadow;
+   tst->sig_mask         = frame->mask;
+   tst->tmp_sig_mask     = frame->mask;
+   tst->arch.vex_shadow1 = frame->vex_shadow1;
+   tst->arch.vex_shadow2 = frame->vex_shadow2;
    /* HACK ALERT */
-   tst->arch.vex        = frame->vex;
+   tst->arch.vex         = frame->vex;
    /* end HACK ALERT */
-   *sigNo               = frame->sigNo_private;
+   *sigNo                = frame->sigNo_private;
    return True;
 }
 
diff --git a/coregrind/m_sigframe/sigframe-ppc32-aix5.c b/coregrind/m_sigframe/sigframe-ppc32-aix5.c
index 78390c6..2cad8bf 100644
--- a/coregrind/m_sigframe/sigframe-ppc32-aix5.c
+++ b/coregrind/m_sigframe/sigframe-ppc32-aix5.c
@@ -63,7 +63,8 @@
 struct hacky_sigframe {
    UChar              lower_guardzone[512];  // put nothing here
    VexGuestPPC32State gst;
-   VexGuestPPC32State gshadow;
+   VexGuestPPC32State gshadow1;
+   VexGuestPPC32State gshadow2;
    UInt               magicPI;
    UInt               sigNo_private;
    UInt               tramp[2];
@@ -77,10 +78,15 @@
 */
 static Bool extend ( ThreadState *tst, Addr addr, SizeT size )
 {
+   ThreadId tid = tst->tid;
    /* For tracking memory events, indicate the entire frame has been
       allocated.  Except, don't mess with the area which
       overlaps the previous frame's redzone. */
-   VG_TRACK( new_mem_stack_signal, addr, size - VG_STACK_REDZONE_SZB );
+   /* XXX is the following call really right?  compared with the
+      amd64-linux version, this doesn't appear to handle the redzone
+      in the same way. */
+   VG_TRACK( new_mem_stack_signal,
+             addr, size - VG_STACK_REDZONE_SZB, tid );
    return True;
 }
 
@@ -130,12 +136,14 @@
 
    /* clear it (very conservatively) */
    VG_(memset)(&frame->lower_guardzone, 0, 512);
-   VG_(memset)(&frame->gst,     0, sizeof(VexGuestPPC32State));
-   VG_(memset)(&frame->gshadow, 0, sizeof(VexGuestPPC32State));
+   VG_(memset)(&frame->gst,      0, sizeof(VexGuestPPC32State));
+   VG_(memset)(&frame->gshadow1, 0, sizeof(VexGuestPPC32State));
+   VG_(memset)(&frame->gshadow2, 0, sizeof(VexGuestPPC32State));
 
    /* save stuff in frame */
    frame->gst           = tst->arch.vex;
-   frame->gshadow       = tst->arch.vex_shadow;
+   frame->gshadow1      = tst->arch.vex_shadow1;
+   frame->gshadow2      = tst->arch.vex_shadow2;
    frame->sigNo_private = sigNo;
    frame->magicPI       = 0x31415927;
 
@@ -201,11 +209,12 @@
    frame = (struct hacky_sigframe*)(sp - 256);
    vg_assert(frame->magicPI == 0x31415927);
 
-   /* restore the entire guest state, and shadow, from the
+   /* restore the entire guest state, and shadows, from the
       frame.  Note, as per comments above, this is a kludge - should
       restore it from saved ucontext.  Oh well. */
    tst->arch.vex = frame->gst;
-   tst->arch.vex_shadow = frame->gshadow;
+   tst->arch.vex_shadow1 = frame->gshadow1;
+   tst->arch.vex_shadow2 = frame->gshadow2;
    sigNo = frame->sigNo_private;
 
    if (VG_(clo_trace_signals))
diff --git a/coregrind/m_sigframe/sigframe-ppc32-linux.c b/coregrind/m_sigframe/sigframe-ppc32-linux.c
index acd6450..261a305 100644
--- a/coregrind/m_sigframe/sigframe-ppc32-linux.c
+++ b/coregrind/m_sigframe/sigframe-ppc32-linux.c
@@ -94,7 +94,8 @@
 struct vg_sig_private {
    UInt magicPI;
    UInt sigNo_private;
-   VexGuestPPC32State shadow;
+   VexGuestPPC32State vex_shadow1;
+   VexGuestPPC32State vex_shadow2;
 };
 
 /* Structure put on stack for signal handlers with SA_SIGINFO clear. */
@@ -504,8 +505,8 @@
 */
 static Bool extend ( ThreadState *tst, Addr addr, SizeT size )
 {
-   ThreadId tid = tst->tid;
-   NSegment const *stackseg = NULL;
+   ThreadId        tid = tst->tid;
+   NSegment const* stackseg = NULL;
 
    if (VG_(extend_stack)(addr, tst->client_stack_szB)) {
       stackseg = VG_(am_find_nsegment)(addr);
@@ -536,7 +537,7 @@
    /* For tracking memory events, indicate the entire frame has been
       allocated. */
    VG_TRACK( new_mem_stack_signal, addr - VG_STACK_REDZONE_SZB,
-             size + VG_STACK_REDZONE_SZB );
+             size + VG_STACK_REDZONE_SZB, tid );
 
    return True;
 }
@@ -761,7 +762,8 @@
 
    priv->magicPI       = 0x31415927;
    priv->sigNo_private = sigNo;
-   priv->shadow        = tst->arch.vex_shadow;
+   priv->vex_shadow1   = tst->arch.vex_shadow1;
+   priv->vex_shadow2   = tst->arch.vex_shadow2;
 
    SET_SIGNAL_GPR(tid, 1, sp);
    SET_SIGNAL_GPR(tid, 3, sigNo);
@@ -931,7 +933,8 @@
    tst->arch.vex.guest_CTR = mc->mc_gregs[VKI_PT_CTR];
    LibVEX_GuestPPC32_put_XER( mc->mc_gregs[VKI_PT_XER], &tst->arch.vex );
 
-   tst->arch.vex_shadow = priv->shadow;
+   tst->arch.vex_shadow1 = priv->vex_shadow1;
+   tst->arch.vex_shadow2 = priv->vex_shadow2;
 
    VG_TRACK(die_mem_stack_signal, sp, frame_size);
 
diff --git a/coregrind/m_sigframe/sigframe-ppc64-aix5.c b/coregrind/m_sigframe/sigframe-ppc64-aix5.c
index b0de2dc..610993e 100644
--- a/coregrind/m_sigframe/sigframe-ppc64-aix5.c
+++ b/coregrind/m_sigframe/sigframe-ppc64-aix5.c
@@ -63,7 +63,8 @@
 struct hacky_sigframe {
    UChar              lower_guardzone[1024];  // put nothing here
    VexGuestPPC64State gst;
-   VexGuestPPC64State gshadow;
+   VexGuestPPC64State gshadow1;
+   VexGuestPPC64State gshadow2;
    UInt               magicPI;
    UInt               sigNo_private;
    UInt               tramp[2];
@@ -77,10 +78,15 @@
 */
 static Bool extend ( ThreadState *tst, Addr addr, SizeT size )
 {
+   ThreadId tid = tst->tid;
    /* For tracking memory events, indicate the entire frame has been
       allocated.  Except, don't mess with the area which
       overlaps the previous frame's redzone. */
-   VG_TRACK( new_mem_stack_signal, addr, size - VG_STACK_REDZONE_SZB );
+   /* XXX is the following call really right?  compared with the
+      amd64-linux version, this doesn't appear to handle the redzone
+      in the same way. */
+   VG_TRACK( new_mem_stack_signal,
+             addr, size - VG_STACK_REDZONE_SZB, tid );
    return True;
 }
 
@@ -130,12 +136,14 @@
 
    /* clear it (very conservatively) */
    VG_(memset)(&frame->lower_guardzone, 0, 1024);
-   VG_(memset)(&frame->gst,     0, sizeof(VexGuestPPC64State));
-   VG_(memset)(&frame->gshadow, 0, sizeof(VexGuestPPC64State));
+   VG_(memset)(&frame->gst,      0, sizeof(VexGuestPPC64State));
+   VG_(memset)(&frame->gshadow1, 0, sizeof(VexGuestPPC64State));
+   VG_(memset)(&frame->gshadow2, 0, sizeof(VexGuestPPC64State));
 
    /* save stuff in frame */
    frame->gst           = tst->arch.vex;
-   frame->gshadow       = tst->arch.vex_shadow;
+   frame->gshadow1      = tst->arch.vex_shadow1;
+   frame->gshadow2      = tst->arch.vex_shadow2;
    frame->sigNo_private = sigNo;
    frame->magicPI       = 0x31415927;
 
@@ -239,7 +247,8 @@
       frame.  Note, as per comments above, this is a kludge - should
       restore it from saved ucontext.  Oh well. */
    tst->arch.vex = frame->gst;
-   tst->arch.vex_shadow = frame->gshadow;
+   tst->arch.vex_shadow1 = frame->gshadow1;
+   tst->arch.vex_shadow2 = frame->gshadow2;
    sigNo = frame->sigNo_private;
 
    if (VG_(clo_trace_signals))
diff --git a/coregrind/m_sigframe/sigframe-ppc64-linux.c b/coregrind/m_sigframe/sigframe-ppc64-linux.c
index 1e0701f..200b0d4 100644
--- a/coregrind/m_sigframe/sigframe-ppc64-linux.c
+++ b/coregrind/m_sigframe/sigframe-ppc64-linux.c
@@ -97,7 +97,8 @@
 struct vg_sig_private {
    UInt magicPI;
    UInt sigNo_private;
-   VexGuestPPC64State shadow;
+   VexGuestPPC64State vex_shadow1;
+   VexGuestPPC64State vex_shadow2;
 };
 
 /* Structure put on stack for all signal handlers. */
@@ -133,8 +134,8 @@
 */
 static Bool extend ( ThreadState *tst, Addr addr, SizeT size )
 {
-   ThreadId tid = tst->tid;
-   NSegment const *stackseg = NULL;
+   ThreadId        tid = tst->tid;
+   NSegment const* stackseg = NULL;
 
    if (VG_(extend_stack)(addr, tst->client_stack_szB)) {
       stackseg = VG_(am_find_nsegment)(addr);
@@ -165,7 +166,7 @@
    /* For tracking memory events, indicate the entire frame has been
       allocated. */
    VG_TRACK( new_mem_stack_signal, addr - VG_STACK_REDZONE_SZB,
-             size + VG_STACK_REDZONE_SZB );
+             size + VG_STACK_REDZONE_SZB, tid );
 
    return True;
 }
@@ -300,7 +301,8 @@
    priv = &frame->priv;
    priv->magicPI       = 0x31415927;
    priv->sigNo_private = sigNo;
-   priv->shadow        = tst->arch.vex_shadow;
+   priv->vex_shadow1   = tst->arch.vex_shadow1;
+   priv->vex_shadow2   = tst->arch.vex_shadow2;
 
    if (0)
       VG_(printf)("pushed signal frame; %R1 now = %p, "
@@ -364,7 +366,8 @@
    LibVEX_GuestPPC64_put_XER( frame->uc.uc_mcontext.gp_regs[VKI_PT_XER], 
                               &tst->arch.vex );
 
-   tst->arch.vex_shadow = priv->shadow;
+   tst->arch.vex_shadow1 = priv->vex_shadow1;
+   tst->arch.vex_shadow2 = priv->vex_shadow2;
 
    VG_TRACK(die_mem_stack_signal, sp, frame_size);
 
diff --git a/coregrind/m_sigframe/sigframe-x86-linux.c b/coregrind/m_sigframe/sigframe-x86-linux.c
index 42cc6d9..0f3e57d 100644
--- a/coregrind/m_sigframe/sigframe-x86-linux.c
+++ b/coregrind/m_sigframe/sigframe-x86-linux.c
@@ -99,7 +99,8 @@
 
    /* XXX This is wrong.  Surely we should store the shadow values
       into the shadow memory behind the actual values? */
-   VexGuestX86State vex_shadow;
+   VexGuestX86State vex_shadow1;
+   VexGuestX86State vex_shadow2;
 
    /* HACK ALERT */
    VexGuestX86State vex;
@@ -395,7 +396,7 @@
 */
 static Bool extend ( ThreadState *tst, Addr addr, SizeT size )
 {
-   ThreadId tid = tst->tid;
+   ThreadId        tid = tst->tid;
    NSegment const* stackseg = NULL;
 
    if (VG_(extend_stack)(addr, tst->client_stack_szB)) {
@@ -427,7 +428,7 @@
    /* For tracking memory events, indicate the entire frame has been
       allocated. */
    VG_TRACK( new_mem_stack_signal, addr - VG_STACK_REDZONE_SZB,
-             size + VG_STACK_REDZONE_SZB );
+             size + VG_STACK_REDZONE_SZB, tid );
 
    return True;
 }
@@ -443,7 +444,8 @@
 {
    frame->sigNo_private = sigNo;
    frame->magicPI       = 0x31415927;
-   frame->vex_shadow    = tst->arch.vex_shadow;
+   frame->vex_shadow1   = tst->arch.vex_shadow1;
+   frame->vex_shadow2   = tst->arch.vex_shadow2;
    /* HACK ALERT */
    frame->vex           = tst->arch.vex;
    /* end HACK ALERT */
@@ -622,13 +624,14 @@
       *sigNo = VKI_SIGSEGV;
       return False;
    }
-   tst->sig_mask        = frame->mask;
-   tst->tmp_sig_mask    = frame->mask;
-   tst->arch.vex_shadow = frame->vex_shadow;
+   tst->sig_mask         = frame->mask;
+   tst->tmp_sig_mask     = frame->mask;
+   tst->arch.vex_shadow1 = frame->vex_shadow1;
+   tst->arch.vex_shadow2 = frame->vex_shadow2;
    /* HACK ALERT */
-   tst->arch.vex        = frame->vex;
+   tst->arch.vex         = frame->vex;
    /* end HACK ALERT */
-   *sigNo               = frame->sigNo_private;
+   *sigNo                = frame->sigNo_private;
    return True;
 }
 
diff --git a/coregrind/m_stacks.c b/coregrind/m_stacks.c
index 33090b7..9a51525 100644
--- a/coregrind/m_stacks.c
+++ b/coregrind/m_stacks.c
@@ -272,8 +272,8 @@
    tracked by the tool, and one of the specialised cases
    (eg. new_mem_stack_4) isn't used in preference.  
 */
-VG_REGPARM(2)
-void VG_(unknown_SP_update)( Addr old_SP, Addr new_SP )
+VG_REGPARM(3)
+void VG_(unknown_SP_update)( Addr old_SP, Addr new_SP, UInt ecu )
 {
    static Int moans = 3;
    Word delta  = (Word)new_SP - (Word)old_SP;
@@ -315,7 +315,8 @@
                 "will not be shown.");
       }
    } else if (delta < 0) {
-      VG_TRACK( new_mem_stack, new_SP, -delta );
+      VG_TRACK( new_mem_stack_w_ECU, new_SP, -delta, ecu );
+      VG_TRACK( new_mem_stack,       new_SP, -delta );
 
    } else if (delta > 0) {
       VG_TRACK( die_mem_stack, old_SP,  delta );
diff --git a/coregrind/m_syswrap/syswrap-amd64-linux.c b/coregrind/m_syswrap/syswrap-amd64-linux.c
index 06850b1..dd424d1 100644
--- a/coregrind/m_syswrap/syswrap-amd64-linux.c
+++ b/coregrind/m_syswrap/syswrap-amd64-linux.c
@@ -322,7 +322,8 @@
 {  
    /* We inherit our parent's guest state. */
    child->vex = parent->vex;
-   child->vex_shadow = parent->vex_shadow;
+   child->vex_shadow1 = parent->vex_shadow1;
+   child->vex_shadow2 = parent->vex_shadow2;
 }  
 
 
diff --git a/coregrind/m_syswrap/syswrap-generic.c b/coregrind/m_syswrap/syswrap-generic.c
index d7e0755..2a21568 100644
--- a/coregrind/m_syswrap/syswrap-generic.c
+++ b/coregrind/m_syswrap/syswrap-generic.c
@@ -2629,7 +2629,7 @@
       if (brk_new > brk_limit) {
          /* successfully grew the data segment */
          VG_TRACK( new_mem_brk, brk_limit,
-                                ARG1-brk_limit );
+                   ARG1-brk_limit, tid );
       }
    } else {
       /* brk() failed */
diff --git a/coregrind/m_syswrap/syswrap-ppc32-linux.c b/coregrind/m_syswrap/syswrap-ppc32-linux.c
index 8e19e1a..d5ce8ee 100644
--- a/coregrind/m_syswrap/syswrap-ppc32-linux.c
+++ b/coregrind/m_syswrap/syswrap-ppc32-linux.c
@@ -375,7 +375,8 @@
 {
    /* We inherit our parent's guest state. */
    child->vex = parent->vex;
-   child->vex_shadow = parent->vex_shadow;
+   child->vex_shadow1 = parent->vex_shadow1;
+   child->vex_shadow2 = parent->vex_shadow2;
 }
 
 
diff --git a/coregrind/m_syswrap/syswrap-ppc64-linux.c b/coregrind/m_syswrap/syswrap-ppc64-linux.c
index c6aa504..3111a2e 100644
--- a/coregrind/m_syswrap/syswrap-ppc64-linux.c
+++ b/coregrind/m_syswrap/syswrap-ppc64-linux.c
@@ -405,7 +405,8 @@
 {
    /* We inherit our parent's guest state. */
    child->vex = parent->vex;
-   child->vex_shadow = parent->vex_shadow;
+   child->vex_shadow1 = parent->vex_shadow1;
+   child->vex_shadow2 = parent->vex_shadow2;
 }
 
 
diff --git a/coregrind/m_syswrap/syswrap-x86-linux.c b/coregrind/m_syswrap/syswrap-x86-linux.c
index 3b69310..cb991f7 100644
--- a/coregrind/m_syswrap/syswrap-x86-linux.c
+++ b/coregrind/m_syswrap/syswrap-x86-linux.c
@@ -732,7 +732,8 @@
 {
    /* We inherit our parent's guest state. */
    child->vex = parent->vex;
-   child->vex_shadow = parent->vex_shadow;
+   child->vex_shadow1 = parent->vex_shadow1;
+   child->vex_shadow2 = parent->vex_shadow2;
 
    /* We inherit our parent's LDT. */
    if (parent->vex.guest_LDT == (HWord)NULL) {
diff --git a/coregrind/m_tooliface.c b/coregrind/m_tooliface.c
index a0aa2fd..bd0af0a 100644
--- a/coregrind/m_tooliface.c
+++ b/coregrind/m_tooliface.c
@@ -100,9 +100,13 @@
 /* static */
 Bool VG_(sanity_check_needs)(Char** failmsg)
 {
+   Bool any_new_mem_stack_N, any_new_mem_stack_N_w_ECU;
+   Bool any_new_mem_stack_w_conflicting_otags;
+   Bool any_die_mem_stack_N;
+
 #define CHECK_NOT(var, value)                                  \
    if ((var)==(value)) {                                       \
-      *failmsg = "Tool error: '" #var "' not initialised\n"; \
+      *failmsg = "Tool error: '" #var "' not initialised\n";   \
       return False;                                            \
    }
    
@@ -113,35 +117,82 @@
    CHECK_NOT(VG_(details).copyright_author, NULL);
    CHECK_NOT(VG_(details).bug_reports_to,   NULL);
 
-   if ( (VG_(tdict).track_new_mem_stack_4   ||
-         VG_(tdict).track_new_mem_stack_8   ||
-         VG_(tdict).track_new_mem_stack_12  ||
-         VG_(tdict).track_new_mem_stack_16  ||
-         VG_(tdict).track_new_mem_stack_32  ||
-         VG_(tdict).track_new_mem_stack_112 ||
-         VG_(tdict).track_new_mem_stack_128 ||
-         VG_(tdict).track_new_mem_stack_144 ||
-         VG_(tdict).track_new_mem_stack_160 ) &&
-       ! VG_(tdict).track_new_mem_stack) 
-   {
-      *failmsg = "Tool error: one of the specialised 'new_mem_stack_n'\n"
+   /* Check that new_mem_stack is defined if any new_mem_stack_N
+      are. */
+   any_new_mem_stack_N 
+      = VG_(tdict).track_new_mem_stack_4   ||
+        VG_(tdict).track_new_mem_stack_8   ||
+        VG_(tdict).track_new_mem_stack_12  ||
+        VG_(tdict).track_new_mem_stack_16  ||
+        VG_(tdict).track_new_mem_stack_32  ||
+        VG_(tdict).track_new_mem_stack_112 ||
+        VG_(tdict).track_new_mem_stack_128 ||
+        VG_(tdict).track_new_mem_stack_144 ||
+        VG_(tdict).track_new_mem_stack_160;
+
+   if (any_new_mem_stack_N && ! VG_(tdict).track_new_mem_stack) {
+      *failmsg = "Tool error: one of the specialised 'new_mem_stack_N'\n"
                  "   events tracked, but not the generic 'new_mem_stack' one.\n"
                  "   'new_mem_stack' should be defined\n";
       return False;
    }
 
-   if ( (VG_(tdict).track_die_mem_stack_4   ||
-         VG_(tdict).track_die_mem_stack_8   ||
-         VG_(tdict).track_die_mem_stack_12  ||
-         VG_(tdict).track_die_mem_stack_16  ||
-         VG_(tdict).track_die_mem_stack_32  ||
-         VG_(tdict).track_die_mem_stack_112 ||
-         VG_(tdict).track_die_mem_stack_128 ||
-         VG_(tdict).track_die_mem_stack_144 ||
-         VG_(tdict).track_die_mem_stack_160 ) &&
-       ! VG_(tdict).track_die_mem_stack) 
-   {
-      *failmsg = "Tool error: one of the specialised 'die_mem_stack_n'\n"
+   /* Check that new_mem_stack_w_ECU is defined if any
+      new_mem_stack_N_w_ECU are. */
+   any_new_mem_stack_N_w_ECU
+      = VG_(tdict).track_new_mem_stack_4_w_ECU   ||
+        VG_(tdict).track_new_mem_stack_8_w_ECU   ||
+        VG_(tdict).track_new_mem_stack_12_w_ECU  ||
+        VG_(tdict).track_new_mem_stack_16_w_ECU  ||
+        VG_(tdict).track_new_mem_stack_32_w_ECU  ||
+        VG_(tdict).track_new_mem_stack_112_w_ECU ||
+        VG_(tdict).track_new_mem_stack_128_w_ECU ||
+        VG_(tdict).track_new_mem_stack_144_w_ECU ||
+        VG_(tdict).track_new_mem_stack_160_w_ECU;
+
+   if (any_new_mem_stack_N_w_ECU && ! VG_(tdict).track_new_mem_stack_w_ECU) {
+      *failmsg = "Tool error: one of the specialised 'new_mem_stack_N_w_ECU'\n"
+                 "   events tracked, but not the generic 'new_mem_stack_w_ECU' one.\n"
+                 "   'new_mem_stack_w_ECU' should be defined\n";
+      return False;
+   }
+
+   /* Check that in no cases are both with- and without-otag versions of the
+      same new_mem_stack_ function defined. */
+   any_new_mem_stack_w_conflicting_otags
+      = (VG_(tdict).track_new_mem_stack_4   && VG_(tdict).track_new_mem_stack_4_w_ECU)   ||
+        (VG_(tdict).track_new_mem_stack_8   && VG_(tdict).track_new_mem_stack_8_w_ECU)   ||
+        (VG_(tdict).track_new_mem_stack_12  && VG_(tdict).track_new_mem_stack_12_w_ECU)  ||
+        (VG_(tdict).track_new_mem_stack_16  && VG_(tdict).track_new_mem_stack_16_w_ECU)  ||
+        (VG_(tdict).track_new_mem_stack_32  && VG_(tdict).track_new_mem_stack_32_w_ECU)  ||
+        (VG_(tdict).track_new_mem_stack_112 && VG_(tdict).track_new_mem_stack_112_w_ECU) ||
+        (VG_(tdict).track_new_mem_stack_128 && VG_(tdict).track_new_mem_stack_128_w_ECU) ||
+        (VG_(tdict).track_new_mem_stack_144 && VG_(tdict).track_new_mem_stack_144_w_ECU) ||
+        (VG_(tdict).track_new_mem_stack_160 && VG_(tdict).track_new_mem_stack_160_w_ECU) ||
+        (VG_(tdict).track_new_mem_stack     && VG_(tdict).track_new_mem_stack_w_ECU);
+
+   if (any_new_mem_stack_w_conflicting_otags) {
+      *failmsg = "Tool error: tool supplies both a 'new_mem_stack_N' and a\n"
+                 "   'new_mem_stack_N_w_ECU' function for some N (or none),\n"
+                 "   but you can only have one or the other (not both)\n";
+      return False;
+   }
+
+   /* Check that die_mem_stack is defined if any die_mem_stack_N
+      are. */
+   any_die_mem_stack_N
+      = VG_(tdict).track_die_mem_stack_4   ||
+        VG_(tdict).track_die_mem_stack_8   ||
+        VG_(tdict).track_die_mem_stack_12  ||
+        VG_(tdict).track_die_mem_stack_16  ||
+        VG_(tdict).track_die_mem_stack_32  ||
+        VG_(tdict).track_die_mem_stack_112 ||
+        VG_(tdict).track_die_mem_stack_128 ||
+        VG_(tdict).track_die_mem_stack_144 ||
+        VG_(tdict).track_die_mem_stack_160;
+
+   if (any_die_mem_stack_N && ! VG_(tdict).track_die_mem_stack) {
+      *failmsg = "Tool error: one of the specialised 'die_mem_stack_N'\n"
                  "   events tracked, but not the generic 'die_mem_stack' one.\n"
                  "   'die_mem_stack' should be defined\n";
       return False;
@@ -272,74 +323,88 @@
 }
 
 /*--------------------------------------------------------------------*/
-/* Tracked events */
+/* Tracked events.  Digit 'n' on DEFn is the REGPARMness. */
 
-#define DEF(fn, args...) \
-void VG_(fn)(void(*f)(args)) \
-{ \
+#define DEF0(fn, args...) \
+void VG_(fn)(void(*f)(args)) { \
+   VG_(tdict).fn = f; \
+}
+
+#define DEF1(fn, args...) \
+void VG_(fn)(VG_REGPARM(1) void(*f)(args)) { \
    VG_(tdict).fn = f; \
 }
 
 #define DEF2(fn, args...) \
-void VG_(fn)(VG_REGPARM(1) void(*f)(args)) \
-{ \
+void VG_(fn)(VG_REGPARM(2) void(*f)(args)) { \
    VG_(tdict).fn = f; \
 }
 
-DEF(track_new_mem_startup,       Addr, SizeT, Bool, Bool, Bool)
-DEF(track_new_mem_stack_signal,  Addr, SizeT)
-DEF(track_new_mem_brk,           Addr, SizeT)
-DEF(track_new_mem_mmap,          Addr, SizeT, Bool, Bool, Bool)
+DEF0(track_new_mem_startup,       Addr, SizeT, Bool, Bool, Bool)
+DEF0(track_new_mem_stack_signal,  Addr, SizeT, UInt)
+DEF0(track_new_mem_brk,           Addr, SizeT, UInt)
+DEF0(track_new_mem_mmap,          Addr, SizeT, Bool, Bool, Bool)
 
-DEF(track_copy_mem_remap,        Addr, Addr, SizeT)
-DEF(track_change_mem_mprotect,   Addr, SizeT, Bool, Bool, Bool)
-DEF(track_die_mem_stack_signal,  Addr, SizeT)
-DEF(track_die_mem_brk,           Addr, SizeT)
-DEF(track_die_mem_munmap,        Addr, SizeT)
+DEF0(track_copy_mem_remap,        Addr, Addr, SizeT)
+DEF0(track_change_mem_mprotect,   Addr, SizeT, Bool, Bool, Bool)
+DEF0(track_die_mem_stack_signal,  Addr, SizeT)
+DEF0(track_die_mem_brk,           Addr, SizeT)
+DEF0(track_die_mem_munmap,        Addr, SizeT)
 
-DEF2(track_new_mem_stack_4,      Addr)
-DEF2(track_new_mem_stack_8,      Addr)
-DEF2(track_new_mem_stack_12,     Addr)
-DEF2(track_new_mem_stack_16,     Addr)
-DEF2(track_new_mem_stack_32,     Addr)
-DEF2(track_new_mem_stack_112,    Addr)
-DEF2(track_new_mem_stack_128,    Addr)
-DEF2(track_new_mem_stack_144,    Addr)
-DEF2(track_new_mem_stack_160,    Addr)
-DEF (track_new_mem_stack,        Addr, SizeT)
+DEF2(track_new_mem_stack_4_w_ECU,    Addr, UInt)
+DEF2(track_new_mem_stack_8_w_ECU,    Addr, UInt)
+DEF2(track_new_mem_stack_12_w_ECU,   Addr, UInt)
+DEF2(track_new_mem_stack_16_w_ECU,   Addr, UInt)
+DEF2(track_new_mem_stack_32_w_ECU,   Addr, UInt)
+DEF2(track_new_mem_stack_112_w_ECU,  Addr, UInt)
+DEF2(track_new_mem_stack_128_w_ECU,  Addr, UInt)
+DEF2(track_new_mem_stack_144_w_ECU,  Addr, UInt)
+DEF2(track_new_mem_stack_160_w_ECU,  Addr, UInt)
+DEF0(track_new_mem_stack_w_ECU,      Addr, SizeT, UInt)
 
-DEF2(track_die_mem_stack_4,      Addr)
-DEF2(track_die_mem_stack_8,      Addr)
-DEF2(track_die_mem_stack_12,     Addr)
-DEF2(track_die_mem_stack_16,     Addr)
-DEF2(track_die_mem_stack_32,     Addr)
-DEF2(track_die_mem_stack_112,    Addr)
-DEF2(track_die_mem_stack_128,    Addr)
-DEF2(track_die_mem_stack_144,    Addr)
-DEF2(track_die_mem_stack_160,    Addr)
-DEF (track_die_mem_stack,        Addr, SizeT)
+DEF1(track_new_mem_stack_4,       Addr)
+DEF1(track_new_mem_stack_8,       Addr)
+DEF1(track_new_mem_stack_12,      Addr)
+DEF1(track_new_mem_stack_16,      Addr)
+DEF1(track_new_mem_stack_32,      Addr)
+DEF1(track_new_mem_stack_112,     Addr)
+DEF1(track_new_mem_stack_128,     Addr)
+DEF1(track_new_mem_stack_144,     Addr)
+DEF1(track_new_mem_stack_160,     Addr)
+DEF0(track_new_mem_stack,         Addr, SizeT)
 
-DEF(track_ban_mem_stack,         Addr, SizeT)
+DEF1(track_die_mem_stack_4,       Addr)
+DEF1(track_die_mem_stack_8,       Addr)
+DEF1(track_die_mem_stack_12,      Addr)
+DEF1(track_die_mem_stack_16,      Addr)
+DEF1(track_die_mem_stack_32,      Addr)
+DEF1(track_die_mem_stack_112,     Addr)
+DEF1(track_die_mem_stack_128,     Addr)
+DEF1(track_die_mem_stack_144,     Addr)
+DEF1(track_die_mem_stack_160,     Addr)
+DEF0(track_die_mem_stack,         Addr, SizeT)
 
-DEF(track_pre_mem_read,          CorePart, ThreadId, Char*, Addr, SizeT)
-DEF(track_pre_mem_read_asciiz,   CorePart, ThreadId, Char*, Addr)
-DEF(track_pre_mem_write,         CorePart, ThreadId, Char*, Addr, SizeT)
-DEF(track_post_mem_write,        CorePart, ThreadId, Addr, SizeT)
+DEF0(track_ban_mem_stack,         Addr, SizeT)
 
-DEF(track_pre_reg_read,          CorePart, ThreadId, Char*, OffT, SizeT)
-DEF(track_post_reg_write,        CorePart, ThreadId,        OffT, SizeT)
+DEF0(track_pre_mem_read,          CorePart, ThreadId, Char*, Addr, SizeT)
+DEF0(track_pre_mem_read_asciiz,   CorePart, ThreadId, Char*, Addr)
+DEF0(track_pre_mem_write,         CorePart, ThreadId, Char*, Addr, SizeT)
+DEF0(track_post_mem_write,        CorePart, ThreadId, Addr, SizeT)
 
-DEF(track_post_reg_write_clientcall_return, ThreadId, OffT, SizeT, Addr)
+DEF0(track_pre_reg_read,          CorePart, ThreadId, Char*, OffT, SizeT)
+DEF0(track_post_reg_write,        CorePart, ThreadId,        OffT, SizeT)
 
-DEF(track_start_client_code,     ThreadId, ULong)
-DEF(track_stop_client_code,      ThreadId, ULong)
+DEF0(track_post_reg_write_clientcall_return, ThreadId, OffT, SizeT, Addr)
 
-DEF(track_pre_thread_ll_create,  ThreadId, ThreadId)
-DEF(track_pre_thread_first_insn, ThreadId)
-DEF(track_pre_thread_ll_exit,    ThreadId)
+DEF0(track_start_client_code,     ThreadId, ULong)
+DEF0(track_stop_client_code,      ThreadId, ULong)
 
-DEF(track_pre_deliver_signal,    ThreadId, Int sigNo, Bool)
-DEF(track_post_deliver_signal,   ThreadId, Int sigNo)
+DEF0(track_pre_thread_ll_create,  ThreadId, ThreadId)
+DEF0(track_pre_thread_first_insn, ThreadId)
+DEF0(track_pre_thread_ll_exit,    ThreadId)
+
+DEF0(track_pre_deliver_signal,    ThreadId, Int sigNo, Bool)
+DEF0(track_post_deliver_signal,   ThreadId, Int sigNo)
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                          ---*/
diff --git a/coregrind/m_translate.c b/coregrind/m_translate.c
index 5857dd9..35744cd 100644
--- a/coregrind/m_translate.c
+++ b/coregrind/m_translate.c
@@ -56,6 +56,8 @@
 #include "pub_core_threadstate.h"  // VexGuestArchState
 #include "pub_core_trampoline.h"   // VG_(ppctoc_magic_redirect_return_stub)
 
+#include "pub_core_execontext.h"  // VG_(make_depth_1_ExeContext_from_Addr)
+
 
 /*------------------------------------------------------------*/
 /*--- Stats                                                ---*/
@@ -191,6 +193,22 @@
    }
 }
 
+/* Given a guest IP, get an origin tag for a 1-element stack trace,
+   and wrap it up in an IR atom that can be passed as the origin-tag
+   value for a stack-adjustment helper function. */
+static IRExpr* mk_ecu_Expr ( Addr64 guest_IP )
+{
+   UInt ecu;
+   ExeContext* ec
+      = VG_(make_depth_1_ExeContext_from_Addr)( (Addr)guest_IP );
+   vg_assert(ec);
+   ecu = VG_(get_ECU_from_ExeContext)( ec );
+   vg_assert(VG_(is_plausible_ECU)(ecu));
+   /* This is always safe to do, since ecu is only 32 bits, and
+      HWord is 32 or 64. */
+   return mkIRExpr_HWord( (HWord)ecu );
+}
+
 
 /* For tools that want to know about SP changes, this pass adds
    in the appropriate hooks.  We have to do it after the tool's
@@ -227,6 +245,10 @@
    IRType      typeof_SP;
    Long        delta, con;
 
+   /* Set up stuff for tracking the guest IP */
+   Bool   curr_IP_known = False;
+   Addr64 curr_IP       = 0;
+
    /* Set up BB */
    IRSB* bb     = emptyIRSB();
    bb->tyenv    = deepCopyIRTypeEnv(sb_in->tyenv);
@@ -240,6 +262,8 @@
    typeof_SP = sizeof_SP==4 ? Ity_I32 : Ity_I64;
    vg_assert(sizeof_SP == 4 || sizeof_SP == 8);
 
+   /* --- Start of #defines --- */
+
 #  define IS_ADD(op) (sizeof_SP==4 ? ((op)==Iop_Add32) : ((op)==Iop_Add64))
 #  define IS_SUB(op) (sizeof_SP==4 ? ((op)==Iop_Sub32) : ((op)==Iop_Sub64))
 
@@ -249,19 +273,62 @@
        (sizeof_SP==4 ? (Long)(Int)(con->Ico.U32)                        \
                      : (Long)(con->Ico.U64))
 
-// XXX: convert this to a function
-#  define DO(kind, syze, tmpp)                                          \
+#  define DO_NEW(syze, tmpp)                                            \
       do {                                                              \
-         if (!VG_(tdict).track_##kind##_mem_stack_##syze)               \
+         Bool vanilla, w_ecu;                                           \
+         vg_assert(curr_IP_known);                                      \
+         vanilla = NULL != VG_(tdict).track_new_mem_stack_##syze;       \
+         w_ecu   = NULL != VG_(tdict).track_new_mem_stack_##syze##_w_ECU; \
+         vg_assert(!(vanilla && w_ecu)); /* can't have both */          \
+         if (!(vanilla || w_ecu))                                       \
+            goto generic;                                               \
+                                                                        \
+         /* I don't know if it's really necessary to say that the */    \
+         /* call reads the stack pointer.  But anyway, we do. */        \
+         if (w_ecu) {                                                   \
+            dcall = unsafeIRDirty_0_N(                                  \
+                       2/*regparms*/,                                   \
+                       "track_new_mem_stack_" #syze "_w_ECU",           \
+                       VG_(fnptr_to_fnentry)(                           \
+                          VG_(tdict).track_new_mem_stack_##syze##_w_ECU ), \
+                       mkIRExprVec_2(IRExpr_RdTmp(tmpp),                \
+                                     mk_ecu_Expr(curr_IP))              \
+                    );                                                  \
+         } else {                                                       \
+            dcall = unsafeIRDirty_0_N(                                  \
+                       1/*regparms*/,                                   \
+                       "track_new_mem_stack_" #syze ,                   \
+                       VG_(fnptr_to_fnentry)(                           \
+                          VG_(tdict).track_new_mem_stack_##syze ),      \
+                       mkIRExprVec_1(IRExpr_RdTmp(tmpp))                \
+                    );                                                  \
+         }                                                              \
+         dcall->nFxState = 1;                                           \
+         dcall->fxState[0].fx     = Ifx_Read;                           \
+         dcall->fxState[0].offset = layout->offset_SP;                  \
+         dcall->fxState[0].size   = layout->sizeof_SP;                  \
+                                                                        \
+         addStmtToIRSB( bb, IRStmt_Dirty(dcall) );                      \
+                                                                        \
+         tl_assert(syze > 0);                                           \
+         update_SP_aliases(syze);                                       \
+                                                                        \
+         n_SP_updates_fast++;                                           \
+                                                                        \
+      } while (0)
+
+#  define DO_DIE(syze, tmpp)                                            \
+      do {                                                              \
+         if (!VG_(tdict).track_die_mem_stack_##syze)                    \
             goto generic;                                               \
                                                                         \
          /* I don't know if it's really necessary to say that the */    \
          /* call reads the stack pointer.  But anyway, we do. */        \
          dcall = unsafeIRDirty_0_N(                                     \
                     1/*regparms*/,                                      \
-                    "track_" #kind "_mem_stack_" #syze,                 \
+                    "track_die_mem_stack_" #syze,                       \
                     VG_(fnptr_to_fnentry)(                              \
-                       VG_(tdict).track_##kind##_mem_stack_##syze ),    \
+                       VG_(tdict).track_die_mem_stack_##syze ),         \
                     mkIRExprVec_1(IRExpr_RdTmp(tmpp))                   \
                  );                                                     \
          dcall->nFxState = 1;                                           \
@@ -271,18 +338,26 @@
                                                                         \
          addStmtToIRSB( bb, IRStmt_Dirty(dcall) );                      \
                                                                         \
-         update_SP_aliases(-delta);                                     \
+         tl_assert(syze > 0);                                           \
+         update_SP_aliases(-(syze));                                    \
                                                                         \
          n_SP_updates_fast++;                                           \
                                                                         \
       } while (0)
 
+   /* --- End of #defines --- */
+
    clear_SP_aliases();
 
    for (i = 0; i <  sb_in->stmts_used; i++) {
 
       st = sb_in->stmts[i];
 
+      if (st->tag == Ist_IMark) {
+         curr_IP_known = True;
+         curr_IP       = st->Ist.IMark.addr;
+      }
+
       /* t = Get(sp):   curr = t, delta = 0 */
       if (st->tag != Ist_WrTmp) goto case2;
       e = st->Ist.WrTmp.data;
@@ -359,24 +434,24 @@
          vg_assert(last_SP == last_Put);
          switch (delta) {
             case    0:                      addStmtToIRSB(bb,st); continue;
-            case    4: DO(die,  4,  tttmp); addStmtToIRSB(bb,st); continue;
-            case   -4: DO(new,  4,  tttmp); addStmtToIRSB(bb,st); continue;
-            case    8: DO(die,  8,  tttmp); addStmtToIRSB(bb,st); continue;
-            case   -8: DO(new,  8,  tttmp); addStmtToIRSB(bb,st); continue;
-            case   12: DO(die,  12, tttmp); addStmtToIRSB(bb,st); continue;
-            case  -12: DO(new,  12, tttmp); addStmtToIRSB(bb,st); continue;
-            case   16: DO(die,  16, tttmp); addStmtToIRSB(bb,st); continue;
-            case  -16: DO(new,  16, tttmp); addStmtToIRSB(bb,st); continue;
-            case   32: DO(die,  32, tttmp); addStmtToIRSB(bb,st); continue;
-            case  -32: DO(new,  32, tttmp); addStmtToIRSB(bb,st); continue;
-            case  112: DO(die, 112, tttmp); addStmtToIRSB(bb,st); continue;
-            case -112: DO(new, 112, tttmp); addStmtToIRSB(bb,st); continue;
-            case  128: DO(die, 128, tttmp); addStmtToIRSB(bb,st); continue;
-            case -128: DO(new, 128, tttmp); addStmtToIRSB(bb,st); continue;
-            case  144: DO(die, 144, tttmp); addStmtToIRSB(bb,st); continue;
-            case -144: DO(new, 144, tttmp); addStmtToIRSB(bb,st); continue;
-            case  160: DO(die, 160, tttmp); addStmtToIRSB(bb,st); continue;
-            case -160: DO(new, 160, tttmp); addStmtToIRSB(bb,st); continue;
+            case    4: DO_DIE(  4,  tttmp); addStmtToIRSB(bb,st); continue;
+            case   -4: DO_NEW(  4,  tttmp); addStmtToIRSB(bb,st); continue;
+            case    8: DO_DIE(  8,  tttmp); addStmtToIRSB(bb,st); continue;
+            case   -8: DO_NEW(  8,  tttmp); addStmtToIRSB(bb,st); continue;
+            case   12: DO_DIE(  12, tttmp); addStmtToIRSB(bb,st); continue;
+            case  -12: DO_NEW(  12, tttmp); addStmtToIRSB(bb,st); continue;
+            case   16: DO_DIE(  16, tttmp); addStmtToIRSB(bb,st); continue;
+            case  -16: DO_NEW(  16, tttmp); addStmtToIRSB(bb,st); continue;
+            case   32: DO_DIE(  32, tttmp); addStmtToIRSB(bb,st); continue;
+            case  -32: DO_NEW(  32, tttmp); addStmtToIRSB(bb,st); continue;
+            case  112: DO_DIE( 112, tttmp); addStmtToIRSB(bb,st); continue;
+            case -112: DO_NEW( 112, tttmp); addStmtToIRSB(bb,st); continue;
+            case  128: DO_DIE( 128, tttmp); addStmtToIRSB(bb,st); continue;
+            case -128: DO_NEW( 128, tttmp); addStmtToIRSB(bb,st); continue;
+            case  144: DO_DIE( 144, tttmp); addStmtToIRSB(bb,st); continue;
+            case -144: DO_NEW( 144, tttmp); addStmtToIRSB(bb,st); continue;
+            case  160: DO_DIE( 160, tttmp); addStmtToIRSB(bb,st); continue;
+            case -160: DO_NEW( 160, tttmp); addStmtToIRSB(bb,st); continue;
             default:  
                /* common values for ppc64: 144 128 160 112 176 */
                n_SP_updates_generic_known++;
@@ -395,14 +470,15 @@
                 we must assume it can be anything allowed in flat IR (tmp
                 or const).
          */
-         IRTemp old_SP;
+         IRTemp  old_SP;
          n_SP_updates_generic_unknown++;
 
          // Nb: if all is well, this generic case will typically be
          // called something like every 1000th SP update.  If it's more than
          // that, the above code may be missing some cases.
         generic:
-         /* Pass both the old and new SP values to this helper. */
+         /* Pass both the old and new SP values to this helper.  Also,
+            pass an origin tag, even if it isn't needed. */
          old_SP = newIRTemp(bb->tyenv, typeof_SP);
          addStmtToIRSB( 
             bb,
@@ -414,11 +490,13 @@
          if (first_Put == first_SP && last_Put == last_SP) {
            /* The common case, an exact write to SP.  So st->Ist.Put.data
               does hold the new value; simple. */
+            vg_assert(curr_IP_known);
             dcall = unsafeIRDirty_0_N( 
-                       2/*regparms*/, 
+                       3/*regparms*/, 
                        "VG_(unknown_SP_update)", 
                        VG_(fnptr_to_fnentry)( &VG_(unknown_SP_update) ),
-                       mkIRExprVec_2( IRExpr_RdTmp(old_SP), st->Ist.Put.data ) 
+                       mkIRExprVec_3( IRExpr_RdTmp(old_SP), st->Ist.Put.data,
+                                      mk_ecu_Expr(curr_IP) ) 
                     );
             addStmtToIRSB( bb, IRStmt_Dirty(dcall) );
             /* don't forget the original assignment */
@@ -447,12 +525,14 @@
             /* 3 */
             addStmtToIRSB( bb, IRStmt_Put(offset_SP, IRExpr_RdTmp(old_SP) ));
             /* 4 */
+            vg_assert(curr_IP_known);
             dcall = unsafeIRDirty_0_N( 
-                       2/*regparms*/, 
+                       3/*regparms*/, 
                        "VG_(unknown_SP_update)", 
                        VG_(fnptr_to_fnentry)( &VG_(unknown_SP_update) ),
-                       mkIRExprVec_2( IRExpr_RdTmp(old_SP),
-                                      IRExpr_RdTmp(new_SP))
+                       mkIRExprVec_3( IRExpr_RdTmp(old_SP),
+                                      IRExpr_RdTmp(new_SP), 
+                                      mk_ecu_Expr(curr_IP) )
                     );
             addStmtToIRSB( bb, IRStmt_Dirty(dcall) );
             /* 5 */
@@ -510,6 +590,12 @@
   complain:
    VG_(core_panic)("vg_SP_update_pass: PutI or Dirty which overlaps SP");
 
+#undef IS_ADD
+#undef IS_SUB
+#undef IS_ADD_OR_SUB
+#undef GET_CONST
+#undef DO_NEW
+#undef DO_DIE
 }
 
 /*------------------------------------------------------------*/
diff --git a/coregrind/pub_core_aspacemgr.h b/coregrind/pub_core_aspacemgr.h
index c7a0675..b6ae45e 100644
--- a/coregrind/pub_core_aspacemgr.h
+++ b/coregrind/pub_core_aspacemgr.h
@@ -103,7 +103,9 @@
 /* Get the filename corresponding to this segment, if known and if it
    has one.  The returned name's storage cannot be assumed to be
    persistent, so the caller should immediately copy the name
-   elsewhere. */
+   elsewhere.  This may return NULL if the file name is not known or
+   for arbitrary other implementation-dependent reasons, so callers
+   need to be able to handle a NULL return value. */
 // Is in tool-visible header file.
 // extern HChar* VG_(am_get_filename)( NSegment* );
 
diff --git a/coregrind/pub_core_execontext.h b/coregrind/pub_core_execontext.h
index b2d0cf1..86eb9eb 100644
--- a/coregrind/pub_core_execontext.h
+++ b/coregrind/pub_core_execontext.h
@@ -50,7 +50,9 @@
 // (Minor hack: we use Addr* as the return type instead of StackTrace so
 // that modules #including this file don't also have to #include
 // pub_core_stacktrace.h also.)
-extern /*StackTrace*/Addr* VG_(extract_StackTrace) ( ExeContext* e );
+extern
+/*StackTrace*/Addr* VG_(get_ExeContext_StackTrace) ( ExeContext* e );
+
 
 #endif   // __PUB_CORE_EXECONTEXT_H
 
diff --git a/coregrind/pub_core_stacks.h b/coregrind/pub_core_stacks.h
index 07875fb..bdf05bb 100644
--- a/coregrind/pub_core_stacks.h
+++ b/coregrind/pub_core_stacks.h
@@ -41,8 +41,8 @@
 extern void  VG_(change_stack)     ( UWord id, Addr start, Addr end );
 extern void  VG_(stack_limits)     ( Addr SP, Addr *start, Addr *end );
 
-extern VG_REGPARM(2)
-       void  VG_(unknown_SP_update) ( Addr old_SP, Addr new_SP );
+extern VG_REGPARM(3)
+       void VG_(unknown_SP_update) ( Addr old_SP, Addr new_SP, UInt otag );
 
 #endif   // __PUB_CORE_STACKS_H
 
diff --git a/coregrind/pub_core_threadstate.h b/coregrind/pub_core_threadstate.h
index 4c6cbfe..9ffcc7c 100644
--- a/coregrind/pub_core_threadstate.h
+++ b/coregrind/pub_core_threadstate.h
@@ -93,14 +93,21 @@
    struct {
       /* --- BEGIN vex-mandated guest state --- */
 
-      /* Saved machine context. */
-      VexGuestArchState vex;
+      /* Note that for code generation reasons, we require that the
+         guest state area, its two shadows, and the spill area, are
+         16-aligned and have 16-aligned sizes, and there are no holes
+         in between.  This is checked by do_pre_run_checks() in
+         scheduler.c. */
 
-      /* Saved shadow context. */
-      VexGuestArchState vex_shadow;
+      /* Saved machine context. */
+      VexGuestArchState vex __attribute__((aligned(16)));
+
+      /* Saved shadow context (2 copies). */
+      VexGuestArchState vex_shadow1 __attribute__((aligned(16)));
+      VexGuestArchState vex_shadow2 __attribute__((aligned(16)));
 
       /* Spill area. */
-      UChar vex_spill[LibVEX_N_SPILL_BYTES];
+      UChar vex_spill[LibVEX_N_SPILL_BYTES] __attribute__((aligned(16)));
 
       /* --- END vex-mandated guest state --- */
    } 
diff --git a/coregrind/pub_core_tooliface.h b/coregrind/pub_core_tooliface.h
index aee88d3..31ac7d6 100644
--- a/coregrind/pub_core_tooliface.h
+++ b/coregrind/pub_core_tooliface.h
@@ -162,8 +162,8 @@
 
    // -- Event tracking functions ------------------------------------
    void (*track_new_mem_startup)     (Addr, SizeT, Bool, Bool, Bool);
-   void (*track_new_mem_stack_signal)(Addr, SizeT);
-   void (*track_new_mem_brk)         (Addr, SizeT);
+   void (*track_new_mem_stack_signal)(Addr, SizeT, ThreadId);
+   void (*track_new_mem_brk)         (Addr, SizeT, ThreadId);
    void (*track_new_mem_mmap)        (Addr, SizeT, Bool, Bool, Bool);
 
    void (*track_copy_mem_remap)      (Addr src, Addr dst, SizeT);
@@ -172,6 +172,17 @@
    void (*track_die_mem_brk)         (Addr, SizeT);
    void (*track_die_mem_munmap)      (Addr, SizeT);
 
+   void VG_REGPARM(2) (*track_new_mem_stack_4_w_ECU)  (Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_8_w_ECU)  (Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_12_w_ECU) (Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_16_w_ECU) (Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_32_w_ECU) (Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_112_w_ECU)(Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_128_w_ECU)(Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_144_w_ECU)(Addr,UInt);
+   void VG_REGPARM(2) (*track_new_mem_stack_160_w_ECU)(Addr,UInt);
+   void (*track_new_mem_stack_w_ECU)(Addr,SizeT,UInt);
+
    void VG_REGPARM(1) (*track_new_mem_stack_4)  (Addr);
    void VG_REGPARM(1) (*track_new_mem_stack_8)  (Addr);
    void VG_REGPARM(1) (*track_new_mem_stack_12) (Addr);
@@ -181,7 +192,7 @@
    void VG_REGPARM(1) (*track_new_mem_stack_128)(Addr);
    void VG_REGPARM(1) (*track_new_mem_stack_144)(Addr);
    void VG_REGPARM(1) (*track_new_mem_stack_160)(Addr);
-   void (*track_new_mem_stack)(Addr, SizeT);
+   void (*track_new_mem_stack)(Addr,SizeT);
 
    void VG_REGPARM(1) (*track_die_mem_stack_4)  (Addr);
    void VG_REGPARM(1) (*track_die_mem_stack_8)  (Addr);
diff --git a/docs/internals/xml-output.txt b/docs/internals/xml-output.txt
index 7b5d502..2ad21c3 100644
--- a/docs/internals/xml-output.txt
+++ b/docs/internals/xml-output.txt
@@ -107,12 +107,13 @@
   </valgrindoutput>
 
 Valgrind versions 3.0.0 and 3.0.1 emit protocol version 1.  Versions
-3.1.X and 3.2.X emit protocol version 2.
+3.1.X and 3.2.X emit protocol version 2.  3.4.X emits protocol version
+3.
 
 
 PROTOCOL for version 3
 ----------------------
-Changes in 3.3.X (tentative): (jrs, 1 March 2008)
+Changes in 3.4.X (tentative): (jrs, 1 March 2008)
 
 * There may be more than one <logfilequalifier> clause, depending on
   how this pans out.  (AshleyP perhaps to investigate)
@@ -120,6 +121,10 @@
 * Some errors may have two <auxwhat> blocks, rather than just one
   (resulting from merge of the DATASYMS branch)
 
+* Some errors may have an ORIGIN component, indicating the origins of
+  uninitialised values.  This results from the merge of the
+  OTRACK_BY_INSTRUMENTATION branch.
+
 
 PROTOCOL for version 2
 ----------------------
@@ -231,6 +236,7 @@
 
      optionally: <auxwhat>TEXT</auxwhat>
      optionally: STACK
+     optionally: ORIGIN
 
   </error>
 
@@ -376,6 +382,19 @@
 * line: gives the line number in the source file
 
 
+ORIGIN
+------
+ORIGIN shows the origin of uninitialised data in errors that involve
+uninitialised data.  STACK shows the origin of the uninitialised
+value.  TEXT gives a human-understandable hint as to the meaning of
+the information in STACK.
+
+   <origin>
+      <what>TEXT</what>
+      STACK
+   </origin>
+
+
 ERRORCOUNTS
 -----------
 This specifies, for each error that has been so far presented,
diff --git a/exp-drd/drd_main.c b/exp-drd/drd_main.c
index 3b9563a..bdb13b5 100644
--- a/exp-drd/drd_main.c
+++ b/exp-drd/drd_main.c
@@ -497,6 +497,20 @@
   }
 }
 
+static void drd_start_using_mem_w_ecu(const Addr a1,
+                                      const SizeT len,
+                                      UInt ec_uniq)
+{
+  drd_start_using_mem(a1, len);
+}
+
+static void drd_start_using_mem_w_tid(const Addr a1,
+                                      const SizeT len,
+                                      ThreadId tid)
+{
+  drd_start_using_mem(a1, len);
+}
+
 static __inline__
 void drd_stop_using_mem(const Addr a1, const SizeT len,
                         const Bool is_stack_mem)
@@ -538,7 +552,8 @@
 static void drd_start_using_mem_stack(const Addr a, const SizeT len)
 {
   thread_set_stack_min(thread_get_running_tid(), a - VG_STACK_REDZONE_SZB);
-  drd_start_using_mem(a - VG_STACK_REDZONE_SZB, len + VG_STACK_REDZONE_SZB);
+  drd_start_using_mem(a - VG_STACK_REDZONE_SZB, 
+                      len + VG_STACK_REDZONE_SZB);
 }
 
 /* Called by the core when the stack of a thread shrinks, to indicate that */
@@ -552,7 +567,9 @@
                      True);
 }
 
-static void drd_start_using_mem_stack_signal(const Addr a, const SizeT len)
+static void drd_start_using_mem_stack_signal(
+               const Addr a, const SizeT len,
+               ThreadId tid_for_whom_the_signal_frame_is_being_constructed)
 {
   thread_set_vg_running_tid(VG_(get_running_tid)());
   drd_start_using_mem(a, len);
@@ -945,6 +962,7 @@
     case Ist_IMark:
       instrument = VG_(seginfo_sect_kind)(NULL, 0, st->Ist.IMark.addr)
         != Vg_SectPLT;
+      addStmtToIRSB(bb, st);
       break;
 
     case Ist_MBE:
@@ -1125,7 +1143,7 @@
   VG_(track_pre_mem_read)         (drd_pre_mem_read);
   VG_(track_pre_mem_read_asciiz)  (drd_pre_mem_read_asciiz);
   VG_(track_post_mem_write)       (drd_post_mem_write);
-  VG_(track_new_mem_brk)          (drd_start_using_mem);
+  VG_(track_new_mem_brk)          (drd_start_using_mem_w_tid);
   VG_(track_new_mem_mmap)         (drd_start_using_mem_w_perms);
   VG_(track_new_mem_stack)        (drd_start_using_mem_stack);
   VG_(track_new_mem_stack_signal) (drd_start_using_mem_stack_signal);
@@ -1140,7 +1158,7 @@
   VG_(track_pre_thread_ll_exit)   (drd_thread_finished);
 
   // Other stuff.
-  drd_register_malloc_wrappers(drd_start_using_mem,
+  drd_register_malloc_wrappers(drd_start_using_mem_w_ecu,
                                drd_stop_using_nonstack_mem);
 
   drd_clientreq_init();
diff --git a/exp-drd/drd_malloc_wrappers.c b/exp-drd/drd_malloc_wrappers.c
index 43f71b4..e9cbbf8 100644
--- a/exp-drd/drd_malloc_wrappers.c
+++ b/exp-drd/drd_malloc_wrappers.c
@@ -99,7 +99,7 @@
     return NULL;
   }
   if (is_zeroed) VG_(memset)((void*)p, 0, size);
-  s_start_using_mem_callback(p, p + size);
+  s_start_using_mem_callback(p, p + size, 0/*ec_uniq*/);
 
   // Only update this stat if allocation succeeded.
   cmalloc_bs_mallocd += size;
@@ -210,7 +210,7 @@
 
       // Allocate a new chunk.
       mc = create_DRD_Chunk(tid, a_new, new_size);
-      s_start_using_mem_callback(a_new, a_new + new_size);
+      s_start_using_mem_callback(a_new, a_new + new_size, 0/*ec_uniq*/);
     }
     else
     {
diff --git a/exp-drd/drd_malloc_wrappers.h b/exp-drd/drd_malloc_wrappers.h
index ce3b93b..89d3b85 100644
--- a/exp-drd/drd_malloc_wrappers.h
+++ b/exp-drd/drd_malloc_wrappers.h
@@ -30,7 +30,7 @@
 #include "pub_tool_execontext.h" // ExeContext
 
 
-typedef void (*StartUsingMem)(const Addr a1, const Addr a2);
+typedef void (*StartUsingMem)(const Addr a1, const Addr a2, UInt ec_uniq);
 typedef void (*StopUsingMem)(const Addr a1, const Addr a2);
 
 
diff --git a/exp-drd/drd_pthread_intercepts.c b/exp-drd/drd_pthread_intercepts.c
index 94e9dd3..5b28d67 100644
--- a/exp-drd/drd_pthread_intercepts.c
+++ b/exp-drd/drd_pthread_intercepts.c
@@ -118,7 +118,9 @@
     /* PTHREAD_MUTEX_TIMED_NP */
     /* PTHREAD_MUTEX_NORMAL */
   case PTHREAD_MUTEX_DEFAULT:
+# if !defined(VGP_ppc32_aix5) && !defined(VGP_ppc64_aix5)
   case PTHREAD_MUTEX_ADAPTIVE_NP:
+# endif
     return mutex_type_default_mutex;
   }
   return mutex_type_invalid_mutex;
diff --git a/exp-drd/tests/recursive_mutex.c b/exp-drd/tests/recursive_mutex.c
index e5b5854..b066ea9 100644
--- a/exp-drd/tests/recursive_mutex.c
+++ b/exp-drd/tests/recursive_mutex.c
@@ -21,7 +21,7 @@
 {
   /* Let the program abort after 3 seconds instead of leaving it deadlocked. */
   alarm(3);
-
+#if !defined(_AIX)
   {
     pthread_mutex_t m = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
 
@@ -51,8 +51,8 @@
     pthread_mutex_init(&m, &attr);
     pthread_mutexattr_destroy(&attr);
     lock_twice(&m);
-    pthread_mutex_destroy(&m);
-  } 
+    pthread_mutex_destroy(&m); }
+#endif /* !defined(_AIX) */
   {
     pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
 
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c
index 5f78aea..fff23eb 100644
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -5425,6 +5425,15 @@
 }
 
 static
+void evh__new_mem_w_tid ( Addr a, SizeT len, ThreadId tid ) {
+   if (SHOW_EVENTS >= 2)
+      VG_(printf)("evh__new_mem_w_tid(%p, %lu)\n", (void*)a, len );
+   shadow_mem_make_New( get_current_Thread(), a, len );
+   if (len >= SCE_BIGRANGE_T && (clo_sanity_flags & SCE_BIGRANGE))
+      all__sanity_check("evh__new_mem_w_tid-post");
+}
+
+static
 void evh__new_mem_w_perms ( Addr a, SizeT len, 
                             Bool rr, Bool ww, Bool xx ) {
    if (SHOW_EVENTS >= 1)
@@ -8821,8 +8830,8 @@
    //VG_(needs_xml_output)          ();
 
    VG_(track_new_mem_startup)     ( evh__new_mem_w_perms );
-   VG_(track_new_mem_stack_signal)( evh__die_mem );
-   VG_(track_new_mem_brk)         ( evh__new_mem );
+   VG_(track_new_mem_stack_signal)( evh__new_mem_w_tid );
+   VG_(track_new_mem_brk)         ( evh__new_mem_w_tid );
    VG_(track_new_mem_mmap)        ( evh__new_mem_w_perms );
    VG_(track_new_mem_stack)       ( evh__new_mem );
 
diff --git a/include/pub_tool_execontext.h b/include/pub_tool_execontext.h
index da1bced..164615c 100644
--- a/include/pub_tool_execontext.h
+++ b/include/pub_tool_execontext.h
@@ -81,6 +81,30 @@
 // Print an ExeContext.
 extern void VG_(pp_ExeContext) ( ExeContext* ec );
 
+// Get the 32-bit unique reference number for this ExeContext
+// (the "ExeContext Unique").  Guaranteed to be nonzero and to be a
+// multiple of four (iow, the lowest two bits are guaranteed to
+// be zero, so that callers can store other information there).
+extern UInt VG_(get_ECU_from_ExeContext)( ExeContext* e );
+
+// How many entries (frames) in this ExeContext?
+extern Int VG_(get_ExeContext_n_ips)( ExeContext* e );
+
+// Find the ExeContext that has the given ECU, if any.
+// NOTE: very slow.  Do not call often.
+extern ExeContext* VG_(get_ExeContext_from_ECU)( UInt uniq );
+
+// Make an ExeContext containing just 'a', and nothing else
+ExeContext* VG_(make_depth_1_ExeContext_from_Addr)( Addr a );
+
+// Is this a plausible-looking ECU ?  Catches some obvious stupid
+// cases, but does not guarantee that the ECU is really valid, that
+// is, has an associated ExeContext.
+static inline Bool VG_(is_plausible_ECU)( UInt ecu ) {
+   return (ecu > 0) && ((ecu & 3) == 0);
+}
+
+
 #endif   // __PUB_TOOL_EXECONTEXT_H
 
 /*--------------------------------------------------------------------*/
diff --git a/include/pub_tool_machine.h b/include/pub_tool_machine.h
index bb9a4cc..6775bc1 100644
--- a/include/pub_tool_machine.h
+++ b/include/pub_tool_machine.h
@@ -83,10 +83,14 @@
 
 // For get/set, 'area' is where the asked-for shadow state will be copied
 // into/from.
-extern void VG_(get_shadow_regs_area) ( ThreadId tid, OffT guest_state_offset,
-                                        SizeT size, UChar* area );
-extern void VG_(set_shadow_regs_area) ( ThreadId tid, OffT guest_state_offset,
-                                        SizeT size, const UChar* area );
+void
+VG_(get_shadow_regs_area) ( ThreadId tid, 
+                            /*DST*/UChar* dst,
+                            /*SRC*/Int shadowNo, OffT offset, SizeT size );
+void
+VG_(set_shadow_regs_area) ( ThreadId tid, 
+                            /*DST*/Int shadowNo, OffT offset, SizeT size,
+                            /*SRC*/const UChar* src );
 
 // Apply a function 'f' to all the general purpose registers in all the
 // current threads.
diff --git a/include/pub_tool_tooliface.h b/include/pub_tool_tooliface.h
index f75c74a..7e94d4b 100644
--- a/include/pub_tool_tooliface.h
+++ b/include/pub_tool_tooliface.h
@@ -469,12 +469,15 @@
    Memory events (Nb: to track heap allocation/freeing, a tool must replace
    malloc() et al.  See above how to do this.)
 
-   These ones occur at startup, upon some signals, and upon some syscalls
- */
+   These ones occur at startup, upon some signals, and upon some syscalls.
+
+   For the new_mem_brk and new_mem_stack_signal, the supplied ThreadId
+   indicates the thread for whom the new memory is being allocated.
+*/
 void VG_(track_new_mem_startup)     (void(*f)(Addr a, SizeT len,
                                               Bool rr, Bool ww, Bool xx));
-void VG_(track_new_mem_stack_signal)(void(*f)(Addr a, SizeT len));
-void VG_(track_new_mem_brk)         (void(*f)(Addr a, SizeT len));
+void VG_(track_new_mem_stack_signal)(void(*f)(Addr a, SizeT len, ThreadId tid));
+void VG_(track_new_mem_brk)         (void(*f)(Addr a, SizeT len, ThreadId tid));
 void VG_(track_new_mem_mmap)        (void(*f)(Addr a, SizeT len,
                                               Bool rr, Bool ww, Bool xx));
 
@@ -494,7 +497,29 @@
    specialised cases are defined, the general case must be defined too.
 
    Nb: all the specialised ones must use the VG_REGPARM(n) attribute.
- */
+
+   For the _new functions, a tool may specify either a with-ECU
+   (ExeContext Unique) or without-ECU version for each size, but not
+   both.  If the with-ECU version is supplied, then the core will
+   arrange to pass, as the ecu argument, a 32-bit int which uniquely
+   identifies the instruction moving the stack pointer down.  This
+   32-bit value is as obtained from VG_(get_ECU_from_ExeContext).
+   VG_(get_ExeContext_from_ECU) can then be used to retrieve the
+   associated depth-1 ExeContext for the location.  All this
+   complexity is provided to support origin tracking in Memcheck.
+*/
+void VG_(track_new_mem_stack_4_w_ECU)  (VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_8_w_ECU)  (VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_12_w_ECU) (VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_16_w_ECU) (VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_32_w_ECU) (VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_112_w_ECU)(VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_128_w_ECU)(VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_144_w_ECU)(VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_160_w_ECU)(VG_REGPARM(2) void(*f)(Addr new_ESP, UInt ecu));
+void VG_(track_new_mem_stack_w_ECU)                  (void(*f)(Addr a, SizeT len,
+                                                                       UInt ecu));
+
 void VG_(track_new_mem_stack_4)  (VG_REGPARM(1) void(*f)(Addr new_ESP));
 void VG_(track_new_mem_stack_8)  (VG_REGPARM(1) void(*f)(Addr new_ESP));
 void VG_(track_new_mem_stack_12) (VG_REGPARM(1) void(*f)(Addr new_ESP));
diff --git a/massif/ms_main.c b/massif/ms_main.c
index ba56c30..8d62e82 100644
--- a/massif/ms_main.c
+++ b/massif/ms_main.c
@@ -1732,7 +1732,7 @@
    die_mem_stack_2(a, len, "stk-die");
 }
 
-static void new_mem_stack_signal(Addr a, SizeT len)
+static void new_mem_stack_signal(Addr a, SizeT len, ThreadId tid)
 {
    new_mem_stack_2(a, len, "sig-new");
 }
diff --git a/memcheck/Makefile.am b/memcheck/Makefile.am
index c8fa3a1..f1ec930 100644
--- a/memcheck/Makefile.am
+++ b/memcheck/Makefile.am
@@ -80,7 +80,8 @@
 	mc_leakcheck.c \
 	mc_malloc_wrappers.c \
 	mc_main.c \
-	mc_translate.c
+	mc_translate.c \
+	mc_machine.c
 
 memcheck_x86_linux_SOURCES      = $(MEMCHECK_SOURCES_COMMON)
 memcheck_x86_linux_CPPFLAGS     = $(AM_CPPFLAGS_X86_LINUX)
diff --git a/memcheck/mc_include.h b/memcheck/mc_include.h
index f15922c..fb0e26b 100644
--- a/memcheck/mc_include.h
+++ b/memcheck/mc_include.h
@@ -101,9 +101,9 @@
 
 /* Shadow memory functions */
 extern Bool MC_(check_mem_is_noaccess)( Addr a, SizeT len, Addr* bad_addr );
-extern void MC_(make_mem_noaccess) ( Addr a, SizeT len );
-extern void MC_(make_mem_undefined)( Addr a, SizeT len );
-extern void MC_(make_mem_defined)  ( Addr a, SizeT len );
+extern void MC_(make_mem_noaccess)        ( Addr a, SizeT len );
+extern void MC_(make_mem_undefined_w_otag)( Addr a, SizeT len, UInt otag );
+extern void MC_(make_mem_defined)         ( Addr a, SizeT len );
 extern void MC_(copy_address_range_state) ( Addr src, Addr dst, SizeT len );
 
 extern void MC_(print_malloc_stats) ( void );
@@ -118,6 +118,50 @@
 extern void  MC_(__builtin_vec_delete) ( ThreadId tid, void* p );
 extern void* MC_(realloc)              ( ThreadId tid, void* p, SizeT new_size );
 
+/*------------------------------------------------------------*/
+/*--- Origin tracking translate-time support               ---*/
+/*------------------------------------------------------------*/
+
+/* See detailed comments in mc_machine.c. */
+extern 
+Int MC_(get_otrack_shadow_offset) ( Int offset, Int szB );
+extern 
+IRType MC_(get_otrack_reg_array_equiv_int_type) ( IRRegArray* arr );
+
+/* Constants which are used as the lowest 2 bits in origin tags.
+   
+   An origin tag comprises an upper 30-bit ECU field and a lower 2-bit
+   'kind' field.  The ECU field is a number given out by m_execontext
+   and has a 1-1 mapping with ExeContext*s.  An ECU can be used
+   directly as an origin tag (otag), but in fact we want to put
+   additional information 'kind' field to indicate roughly where the
+   tag came from.  This helps print more understandable error messages
+   for the user -- it has no other purpose.
+
+   Hence the following 2-bit constants are needed for 'kind' field. 
+
+   To summarise:
+
+   * Both ECUs and origin tags are represented as 32-bit words
+
+   * m_execontext and the core-tool interface deal purely in ECUs.
+     They have no knowledge of origin tags - that is a purely
+     Memcheck-internal matter.
+
+   * all valid ECUs have the lowest 2 bits zero and at least
+     one of the upper 30 bits nonzero (see VG_(is_plausible_ECU))
+
+   * to convert from an ECU to an otag, OR in one of the MC_OKIND_
+     constants below
+
+   * to convert an otag back to an ECU, AND it with ~3
+*/
+
+#define MC_OKIND_UNKNOWN  0  /* unknown origin */
+#define MC_OKIND_HEAP     1  /* this is a heap origin */
+#define MC_OKIND_STACK    2  /* this is a stack origin */
+#define MC_OKIND_USER     3  /* arises from user-supplied client req */
+
 
 /*------------------------------------------------------------*/
 /*--- Profiling of memory events                           ---*/
@@ -270,18 +314,6 @@
  * default: NO */
 extern Bool MC_(clo_workaround_gcc296_bugs);
 
-/* Do undefined value checking? "No" gives Addrcheck-style behaviour, ie.
- * faster but fewer errors found.  Note that although Addrcheck had 1 bit
- * per byte overhead vs the old Memcheck's 9 bits per byte, with this mode
- * and compressed V bits, no memory is saved with this mode -- it's still
- * 2 bits per byte overhead.  This is a little wasteful -- it could be done
- * with 1 bit per byte -- but lets us reuse the many shadow memory access
- * functions.  Note also that in this mode the secondary V bit table is
- * never used.
- *
- * default: YES */
-extern Bool MC_(clo_undef_value_errors);
-
 /* Fill malloc-d/free-d client blocks with a specific value?  -1 if
    not, else 0x00 .. 0xFF indicating the fill value to use.  Can be
    useful for causing programs with bad heap corruption to fail in
@@ -291,18 +323,57 @@
 extern Int MC_(clo_malloc_fill);
 extern Int MC_(clo_free_fill);
 
+/* Indicates the level of instrumentation/checking done by Memcheck.
+
+   1 = No undefined value checking, Addrcheck-style behaviour only:
+       only address checking is done.  This is faster but finds fewer
+       errors.  Note that although Addrcheck had 1 bit per byte
+       overhead vs the old Memcheck's 9 bits per byte, with this mode
+       and compressed V bits, no memory is saved with this mode --
+       it's still 2 bits per byte overhead.  This is a little wasteful
+       -- it could be done with 1 bit per byte -- but lets us reuse
+       the many shadow memory access functions.  Note that in this
+       mode neither the secondary V bit table nor the origin-tag cache
+       are used.
+
+   2 = Address checking and Undefined value checking are performed,
+       but origins are not tracked.  So the origin-tag cache is not
+       used in this mode.  This setting is the default and corresponds
+       to the "normal" Memcheck behaviour that has shipped for years.
+
+   3 = Address checking, undefined value checking, and origins for
+       undefined values are tracked.
+
+   The default is 2.
+*/
+extern Int MC_(clo_mc_level);
+
 
 /*------------------------------------------------------------*/
 /*--- Instrumentation                                      ---*/
 /*------------------------------------------------------------*/
 
 /* Functions defined in mc_main.c */
-extern VG_REGPARM(1) void MC_(helperc_complain_undef) ( HWord );
-extern void MC_(helperc_value_check8_fail) ( void );
-extern void MC_(helperc_value_check4_fail) ( void );
-extern void MC_(helperc_value_check1_fail) ( void );
-extern void MC_(helperc_value_check0_fail) ( void );
 
+/* For the fail_w_o functions, the UWord arg is actually the 32-bit
+   origin tag and should really be UInt, but to be simple and safe
+   considering it's called from generated code, just claim it to be a
+   UWord. */
+extern VG_REGPARM(2) void MC_(helperc_value_checkN_fail_w_o) ( HWord, UWord );
+extern VG_REGPARM(1) void MC_(helperc_value_check8_fail_w_o) ( UWord );
+extern VG_REGPARM(1) void MC_(helperc_value_check4_fail_w_o) ( UWord );
+extern VG_REGPARM(1) void MC_(helperc_value_check1_fail_w_o) ( UWord );
+extern VG_REGPARM(1) void MC_(helperc_value_check0_fail_w_o) ( UWord );
+
+/* And call these ones instead to report an uninitialised value error
+   but with no origin available. */
+extern VG_REGPARM(1) void MC_(helperc_value_checkN_fail_no_o) ( HWord );
+extern VG_REGPARM(0) void MC_(helperc_value_check8_fail_no_o) ( void );
+extern VG_REGPARM(0) void MC_(helperc_value_check4_fail_no_o) ( void );
+extern VG_REGPARM(0) void MC_(helperc_value_check1_fail_no_o) ( void );
+extern VG_REGPARM(0) void MC_(helperc_value_check0_fail_no_o) ( void );
+
+/* V-bits load/store helpers */
 extern VG_REGPARM(1) void MC_(helperc_STOREV64be) ( Addr, ULong );
 extern VG_REGPARM(1) void MC_(helperc_STOREV64le) ( Addr, ULong );
 extern VG_REGPARM(2) void MC_(helperc_STOREV32be) ( Addr, UWord );
@@ -319,7 +390,20 @@
 extern VG_REGPARM(1) UWord MC_(helperc_LOADV16le) ( Addr );
 extern VG_REGPARM(1) UWord MC_(helperc_LOADV8)    ( Addr );
 
-extern void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len );
+extern void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
+                                                        Addr nia );
+
+/* Origin tag load/store helpers */
+VG_REGPARM(2) void  MC_(helperc_b_store1) ( Addr a, UWord d32 );
+VG_REGPARM(2) void  MC_(helperc_b_store2) ( Addr a, UWord d32 );
+VG_REGPARM(2) void  MC_(helperc_b_store4) ( Addr a, UWord d32 );
+VG_REGPARM(2) void  MC_(helperc_b_store8) ( Addr a, UWord d32 );
+VG_REGPARM(2) void  MC_(helperc_b_store16)( Addr a, UWord d32 );
+VG_REGPARM(1) UWord MC_(helperc_b_load1) ( Addr a );
+VG_REGPARM(1) UWord MC_(helperc_b_load2) ( Addr a );
+VG_REGPARM(1) UWord MC_(helperc_b_load4) ( Addr a );
+VG_REGPARM(1) UWord MC_(helperc_b_load8) ( Addr a );
+VG_REGPARM(1) UWord MC_(helperc_b_load16)( Addr a );
 
 /* Functions defined in mc_translate.c */
 extern
diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c
new file mode 100644
index 0000000..d8072fe
--- /dev/null
+++ b/memcheck/mc_machine.c
@@ -0,0 +1,754 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Contains machine-specific (guest-state-layout-specific)      ---*/
+/*--- support for origin tracking.                                 ---*/
+/*---                                                 mc_machine.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of MemCheck, a heavyweight Valgrind tool for
+   detecting memory errors.
+
+   Copyright (C) 2008-2008 OpenWorks Ltd
+      info@open-works.co.uk
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+
+   Neither the names of the U.S. Department of Energy nor the
+   University of California nor the names of its contributors may be
+   used to endorse or promote products derived from this software
+   without prior written permission.
+*/
+
+#include "pub_tool_basics.h"
+#include "pub_tool_hashtable.h"     // For mc_include.h
+#include "pub_tool_libcassert.h"
+#include "pub_tool_libcprint.h"
+#include "pub_tool_tooliface.h"
+
+#include "mc_include.h"
+
+#undef MC_SIZEOF_GUEST_STATE
+
+#if defined(VGA_x86)
+# include "libvex_guest_x86.h"
+# define MC_SIZEOF_GUEST_STATE sizeof(VexGuestX86State)
+#endif
+
+#if defined(VGA_amd64)
+# include "libvex_guest_amd64.h"
+# define MC_SIZEOF_GUEST_STATE sizeof(VexGuestAMD64State)
+#endif
+
+#if defined(VGA_ppc32)
+# include "libvex_guest_ppc32.h"
+# define MC_SIZEOF_GUEST_STATE sizeof(VexGuestPPC32State)
+#endif
+
+#if defined(VGA_ppc64)
+# include "libvex_guest_ppc64.h"
+# define MC_SIZEOF_GUEST_STATE sizeof(VexGuestPPC64State)
+#endif
+
+static inline Bool host_is_big_endian ( void ) {
+   UInt x = 0x11223344;
+   return 0x1122 == *(UShort*)(&x);
+}
+static inline Bool host_is_little_endian ( void ) {
+   UInt x = 0x11223344;
+   return 0x3344 == *(UShort*)(&x);
+}
+
+
+/* Let (offset,szB) describe a reference to the guest state section
+   [offset, offset+szB).
+
+   This function returns the corresponding guest state reference to be
+   used for the origin tag (which of course will be in the second
+   shadow area), or -1 if this piece of guest state is not to be
+   tracked.
+
+   Since origin tags are 32-bits long, we expect any returned value
+   (except -1) to be a multiple of 4, between 0 and
+   sizeof(guest-state)-4 inclusive.
+
+   This is inherently (guest-)architecture specific.  For x86 and
+   amd64 we do some somewhat tricky things to give %AH .. %DH their
+   own tags.  On ppc32/64 we do some marginally tricky things to give
+   all 16 %CR components their own tags.
+
+   This function only deals with references to the guest state whose
+   offsets are known at translation time (that is, references arising
+   from Put and Get).  References whose offset is not known until run
+   time (that is, arise from PutI and GetI) are handled by
+   MC_(get_otrack_reg_array_equiv_int_type) below.
+
+   Note that since some guest state arrays (eg, the x86 FP reg stack)
+   are accessed both as arrays (eg, x87 insns) and directly (eg, MMX
+   insns), the two functions must be consistent for those sections of
+   guest state -- that is, they must both say the area is shadowed, or
+   both say it is not.
+
+   This function is dependent on the host's endianness, hence we
+   assert that the use case is supported.
+*/
+static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ); /*fwds*/
+
+Int MC_(get_otrack_shadow_offset) ( Int offset, Int szB )
+{
+   Int cand = get_otrack_shadow_offset_wrk( offset, szB );
+   if (cand == -1) 
+      return cand;
+   tl_assert(0 == (cand & 3));
+   tl_assert(cand <= MC_SIZEOF_GUEST_STATE-4);
+   return cand;
+}
+
+
+static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
+{
+   /* -------------------- ppc64 -------------------- */
+
+#  if defined(VGA_ppc64)
+
+#  define GOF(_fieldname) \
+      (offsetof(VexGuestPPC64State,guest_##_fieldname))
+#  define SZB(_fieldname) \
+      (sizeof(((VexGuestPPC64State*)0)->guest_##_fieldname))
+
+   Int  sz   = szB;
+   Int  o    = offset;
+   tl_assert(sz > 0);
+   tl_assert(host_is_big_endian());
+
+   if (sz == 8 || sz == 4) {
+      /* The point of this is to achieve
+         if ((o == GOF(GPRn) && sz == 8) || (o == 4+GOF(GPRn) && sz == 4))
+            return GOF(GPRn);
+         by testing ox instead of o, and setting ox back 4 bytes when sz == 4.
+      */
+      Int ox = sz == 8 ? o : (o - 4);
+      if (ox == GOF(GPR0)) return ox;
+      if (ox == GOF(GPR1)) return ox;
+      if (ox == GOF(GPR2)) return ox;
+      if (ox == GOF(GPR3)) return ox;
+      if (ox == GOF(GPR4)) return ox;
+      if (ox == GOF(GPR5)) return ox;
+      if (ox == GOF(GPR6)) return ox;
+      if (ox == GOF(GPR7)) return ox;
+      if (ox == GOF(GPR8)) return ox;
+      if (ox == GOF(GPR9)) return ox;
+      if (ox == GOF(GPR10)) return ox;
+      if (ox == GOF(GPR11)) return ox;
+      if (ox == GOF(GPR12)) return ox;
+      if (ox == GOF(GPR13)) return ox;
+      if (ox == GOF(GPR14)) return ox;
+      if (ox == GOF(GPR15)) return ox;
+      if (ox == GOF(GPR16)) return ox;
+      if (ox == GOF(GPR17)) return ox;
+      if (ox == GOF(GPR18)) return ox;
+      if (ox == GOF(GPR19)) return ox;
+      if (ox == GOF(GPR20)) return ox;
+      if (ox == GOF(GPR21)) return ox;
+      if (ox == GOF(GPR22)) return ox;
+      if (ox == GOF(GPR23)) return ox;
+      if (ox == GOF(GPR24)) return ox;
+      if (ox == GOF(GPR25)) return ox;
+      if (ox == GOF(GPR26)) return ox;
+      if (ox == GOF(GPR27)) return ox;
+      if (ox == GOF(GPR28)) return ox;
+      if (ox == GOF(GPR29)) return ox;
+      if (ox == GOF(GPR30)) return ox;
+      if (ox == GOF(GPR31)) return ox;
+   }
+
+   if (o == GOF(LR)  && sz == 8) return o;
+   if (o == GOF(CTR) && sz == 8) return o;
+
+   if (o == GOF(CIA)       && sz == 8) return -1;
+   if (o == GOF(CIA_AT_SC) && sz == 8) return -1;
+   if (o == GOF(RESVN)     && sz == 8) return -1;
+   if (o == GOF(FPROUND)   && sz == 4) return -1;
+   if (o == GOF(EMWARN)    && sz == 4) return -1;
+   if (o == GOF(TISTART)   && sz == 8) return -1;
+   if (o == GOF(TILEN)     && sz == 8) return -1;
+   if (o == GOF(VSCR)      && sz == 4) return -1;
+   if (o == GOF(VRSAVE)    && sz == 4) return -1;
+   if (o == GOF(REDIR_SP)  && sz == 8) return -1;
+
+   tl_assert(SZB(FPR0) == 8);
+   if (o == GOF(FPR0) && sz == 8) return o;
+   if (o == GOF(FPR1) && sz == 8) return o;
+   if (o == GOF(FPR2) && sz == 8) return o;
+   if (o == GOF(FPR3) && sz == 8) return o;
+   if (o == GOF(FPR4) && sz == 8) return o;
+   if (o == GOF(FPR5) && sz == 8) return o;
+   if (o == GOF(FPR6) && sz == 8) return o;
+   if (o == GOF(FPR7) && sz == 8) return o;
+   if (o == GOF(FPR8) && sz == 8) return o;
+   if (o == GOF(FPR9) && sz == 8) return o;
+   if (o == GOF(FPR10) && sz == 8) return o;
+   if (o == GOF(FPR11) && sz == 8) return o;
+   if (o == GOF(FPR12) && sz == 8) return o;
+   if (o == GOF(FPR13) && sz == 8) return o;
+   if (o == GOF(FPR14) && sz == 8) return o;
+   if (o == GOF(FPR15) && sz == 8) return o;
+   if (o == GOF(FPR16) && sz == 8) return o;
+   if (o == GOF(FPR17) && sz == 8) return o;
+   if (o == GOF(FPR18) && sz == 8) return o;
+   if (o == GOF(FPR19) && sz == 8) return o;
+   if (o == GOF(FPR20) && sz == 8) return o;
+   if (o == GOF(FPR21) && sz == 8) return o;
+   if (o == GOF(FPR22) && sz == 8) return o;
+   if (o == GOF(FPR23) && sz == 8) return o;
+   if (o == GOF(FPR24) && sz == 8) return o;
+   if (o == GOF(FPR25) && sz == 8) return o;
+   if (o == GOF(FPR26) && sz == 8) return o;
+   if (o == GOF(FPR27) && sz == 8) return o;
+   if (o == GOF(FPR28) && sz == 8) return o;
+   if (o == GOF(FPR29) && sz == 8) return o;
+   if (o == GOF(FPR30) && sz == 8) return o;
+   if (o == GOF(FPR31) && sz == 8) return o;
+
+   /* For the various byte sized XER/CR pieces, use offset 8
+      in VR0 .. VR31. */
+   tl_assert(SZB(VR0) == 16);
+   if (o == GOF(XER_SO) && sz == 1) return 8 +GOF(VR0);
+   if (o == GOF(XER_OV) && sz == 1) return 8 +GOF(VR1);
+   if (o == GOF(XER_CA) && sz == 1) return 8 +GOF(VR2);
+   if (o == GOF(XER_BC) && sz == 1) return 8 +GOF(VR3);
+
+   if (o == GOF(CR0_321) && sz == 1) return 8 +GOF(VR4);
+   if (o == GOF(CR0_0)   && sz == 1) return 8 +GOF(VR5);
+   if (o == GOF(CR1_321) && sz == 1) return 8 +GOF(VR6);
+   if (o == GOF(CR1_0)   && sz == 1) return 8 +GOF(VR7);
+   if (o == GOF(CR2_321) && sz == 1) return 8 +GOF(VR8);
+   if (o == GOF(CR2_0)   && sz == 1) return 8 +GOF(VR9);
+   if (o == GOF(CR3_321) && sz == 1) return 8 +GOF(VR10);
+   if (o == GOF(CR3_0)   && sz == 1) return 8 +GOF(VR11);
+   if (o == GOF(CR4_321) && sz == 1) return 8 +GOF(VR12);
+   if (o == GOF(CR4_0)   && sz == 1) return 8 +GOF(VR13);
+   if (o == GOF(CR5_321) && sz == 1) return 8 +GOF(VR14);
+   if (o == GOF(CR5_0)   && sz == 1) return 8 +GOF(VR15);
+   if (o == GOF(CR6_321) && sz == 1) return 8 +GOF(VR16);
+   if (o == GOF(CR6_0)   && sz == 1) return 8 +GOF(VR17);
+   if (o == GOF(CR7_321) && sz == 1) return 8 +GOF(VR18);
+   if (o == GOF(CR7_0)   && sz == 1) return 8 +GOF(VR19);
+
+   /* Vector registers .. use offset 0 in VR0 .. VR31. */
+   if (o >= GOF(VR0)  && o+sz <= GOF(VR0) +SZB(VR0))  return 0+ GOF(VR0);
+   if (o >= GOF(VR1)  && o+sz <= GOF(VR1) +SZB(VR1))  return 0+ GOF(VR1);
+   if (o >= GOF(VR2)  && o+sz <= GOF(VR2) +SZB(VR2))  return 0+ GOF(VR2);
+   if (o >= GOF(VR3)  && o+sz <= GOF(VR3) +SZB(VR3))  return 0+ GOF(VR3);
+   if (o >= GOF(VR4)  && o+sz <= GOF(VR4) +SZB(VR4))  return 0+ GOF(VR4);
+   if (o >= GOF(VR5)  && o+sz <= GOF(VR5) +SZB(VR5))  return 0+ GOF(VR5);
+   if (o >= GOF(VR6)  && o+sz <= GOF(VR6) +SZB(VR6))  return 0+ GOF(VR6);
+   if (o >= GOF(VR7)  && o+sz <= GOF(VR7) +SZB(VR7))  return 0+ GOF(VR7);
+   if (o >= GOF(VR8)  && o+sz <= GOF(VR8) +SZB(VR8))  return 0+ GOF(VR8);
+   if (o >= GOF(VR9)  && o+sz <= GOF(VR9) +SZB(VR9))  return 0+ GOF(VR9);
+   if (o >= GOF(VR10) && o+sz <= GOF(VR10)+SZB(VR10)) return 0+ GOF(VR10);
+   if (o >= GOF(VR11) && o+sz <= GOF(VR11)+SZB(VR11)) return 0+ GOF(VR11);
+   if (o >= GOF(VR12) && o+sz <= GOF(VR12)+SZB(VR12)) return 0+ GOF(VR12);
+   if (o >= GOF(VR13) && o+sz <= GOF(VR13)+SZB(VR13)) return 0+ GOF(VR13);
+   if (o >= GOF(VR14) && o+sz <= GOF(VR14)+SZB(VR14)) return 0+ GOF(VR14);
+   if (o >= GOF(VR15) && o+sz <= GOF(VR15)+SZB(VR15)) return 0+ GOF(VR15);
+   if (o >= GOF(VR16) && o+sz <= GOF(VR16)+SZB(VR16)) return 0+ GOF(VR16);
+   if (o >= GOF(VR17) && o+sz <= GOF(VR17)+SZB(VR17)) return 0+ GOF(VR17);
+   if (o >= GOF(VR18) && o+sz <= GOF(VR18)+SZB(VR18)) return 0+ GOF(VR18);
+   if (o >= GOF(VR19) && o+sz <= GOF(VR19)+SZB(VR19)) return 0+ GOF(VR19);
+   if (o >= GOF(VR20) && o+sz <= GOF(VR20)+SZB(VR20)) return 0+ GOF(VR20);
+   if (o >= GOF(VR21) && o+sz <= GOF(VR21)+SZB(VR21)) return 0+ GOF(VR21);
+   if (o >= GOF(VR22) && o+sz <= GOF(VR22)+SZB(VR22)) return 0+ GOF(VR22);
+   if (o >= GOF(VR23) && o+sz <= GOF(VR23)+SZB(VR23)) return 0+ GOF(VR23);
+   if (o >= GOF(VR24) && o+sz <= GOF(VR24)+SZB(VR24)) return 0+ GOF(VR24);
+   if (o >= GOF(VR25) && o+sz <= GOF(VR25)+SZB(VR25)) return 0+ GOF(VR25);
+   if (o >= GOF(VR26) && o+sz <= GOF(VR26)+SZB(VR26)) return 0+ GOF(VR26);
+   if (o >= GOF(VR27) && o+sz <= GOF(VR27)+SZB(VR27)) return 0+ GOF(VR27);
+   if (o >= GOF(VR28) && o+sz <= GOF(VR28)+SZB(VR28)) return 0+ GOF(VR28);
+   if (o >= GOF(VR29) && o+sz <= GOF(VR29)+SZB(VR29)) return 0+ GOF(VR29);
+   if (o >= GOF(VR30) && o+sz <= GOF(VR30)+SZB(VR30)) return 0+ GOF(VR30);
+   if (o >= GOF(VR31) && o+sz <= GOF(VR31)+SZB(VR31)) return 0+ GOF(VR31);
+
+   VG_(printf)("MC_(get_otrack_shadow_offset)(ppc64)(off=%d,sz=%d)\n",
+               offset,szB);
+   tl_assert(0);
+#  undef GOF
+#  undef SZB
+
+   /* -------------------- ppc32 -------------------- */
+
+#  elif defined(VGA_ppc32)
+
+#  define GOF(_fieldname) \
+      (offsetof(VexGuestPPC32State,guest_##_fieldname))
+#  define SZB(_fieldname) \
+      (sizeof(((VexGuestPPC32State*)0)->guest_##_fieldname))
+   Int  o  = offset;
+   Int  sz = szB;
+   tl_assert(sz > 0);
+   tl_assert(host_is_big_endian());
+
+   if (o == GOF(GPR0) && sz == 4) return o;
+   if (o == GOF(GPR1) && sz == 4) return o;
+   if (o == GOF(GPR2) && sz == 4) return o;
+   if (o == GOF(GPR3) && sz == 4) return o;
+   if (o == GOF(GPR4) && sz == 4) return o;
+   if (o == GOF(GPR5) && sz == 4) return o;
+   if (o == GOF(GPR6) && sz == 4) return o;
+   if (o == GOF(GPR7) && sz == 4) return o;
+   if (o == GOF(GPR8) && sz == 4) return o;
+   if (o == GOF(GPR9) && sz == 4) return o;
+   if (o == GOF(GPR10) && sz == 4) return o;
+   if (o == GOF(GPR11) && sz == 4) return o;
+   if (o == GOF(GPR12) && sz == 4) return o;
+   if (o == GOF(GPR13) && sz == 4) return o;
+   if (o == GOF(GPR14) && sz == 4) return o;
+   if (o == GOF(GPR15) && sz == 4) return o;
+   if (o == GOF(GPR16) && sz == 4) return o;
+   if (o == GOF(GPR17) && sz == 4) return o;
+   if (o == GOF(GPR18) && sz == 4) return o;
+   if (o == GOF(GPR19) && sz == 4) return o;
+   if (o == GOF(GPR20) && sz == 4) return o;
+   if (o == GOF(GPR21) && sz == 4) return o;
+   if (o == GOF(GPR22) && sz == 4) return o;
+   if (o == GOF(GPR23) && sz == 4) return o;
+   if (o == GOF(GPR24) && sz == 4) return o;
+   if (o == GOF(GPR25) && sz == 4) return o;
+   if (o == GOF(GPR26) && sz == 4) return o;
+   if (o == GOF(GPR27) && sz == 4) return o;
+   if (o == GOF(GPR28) && sz == 4) return o;
+   if (o == GOF(GPR29) && sz == 4) return o;
+   if (o == GOF(GPR30) && sz == 4) return o;
+   if (o == GOF(GPR31) && sz == 4) return o;
+
+   if (o == GOF(LR)  && sz == 4) return o;
+   if (o == GOF(CTR) && sz == 4) return o;
+
+   if (o == GOF(CIA)       && sz == 4) return -1;
+   if (o == GOF(CIA_AT_SC) && sz == 4) return -1;
+   if (o == GOF(RESVN)     && sz == 4) return -1;
+   if (o == GOF(FPROUND)   && sz == 4) return -1;
+   if (o == GOF(EMWARN)    && sz == 4) return -1;
+   if (o == GOF(TISTART)   && sz == 4) return -1;
+   if (o == GOF(TILEN)     && sz == 4) return -1;
+   if (o == GOF(VSCR)      && sz == 4) return -1;
+   if (o == GOF(REDIR_SP)  && sz == 4) return -1;
+   if (o == GOF(SPRG3_RO)  && sz == 4) return -1;
+
+   tl_assert(SZB(FPR0) == 8);
+   if (o == GOF(FPR0) && sz == 8) return o;
+   if (o == GOF(FPR1) && sz == 8) return o;
+   if (o == GOF(FPR2) && sz == 8) return o;
+   if (o == GOF(FPR3) && sz == 8) return o;
+   if (o == GOF(FPR4) && sz == 8) return o;
+   if (o == GOF(FPR5) && sz == 8) return o;
+   if (o == GOF(FPR6) && sz == 8) return o;
+   if (o == GOF(FPR7) && sz == 8) return o;
+   if (o == GOF(FPR8) && sz == 8) return o;
+   if (o == GOF(FPR9) && sz == 8) return o;
+   if (o == GOF(FPR10) && sz == 8) return o;
+   if (o == GOF(FPR11) && sz == 8) return o;
+   if (o == GOF(FPR12) && sz == 8) return o;
+   if (o == GOF(FPR13) && sz == 8) return o;
+   if (o == GOF(FPR14) && sz == 8) return o;
+   if (o == GOF(FPR15) && sz == 8) return o;
+   if (o == GOF(FPR16) && sz == 8) return o;
+   if (o == GOF(FPR17) && sz == 8) return o;
+   if (o == GOF(FPR18) && sz == 8) return o;
+   if (o == GOF(FPR19) && sz == 8) return o;
+   if (o == GOF(FPR20) && sz == 8) return o;
+   if (o == GOF(FPR21) && sz == 8) return o;
+   if (o == GOF(FPR22) && sz == 8) return o;
+   if (o == GOF(FPR23) && sz == 8) return o;
+   if (o == GOF(FPR24) && sz == 8) return o;
+   if (o == GOF(FPR25) && sz == 8) return o;
+   if (o == GOF(FPR26) && sz == 8) return o;
+   if (o == GOF(FPR27) && sz == 8) return o;
+   if (o == GOF(FPR28) && sz == 8) return o;
+   if (o == GOF(FPR29) && sz == 8) return o;
+   if (o == GOF(FPR30) && sz == 8) return o;
+   if (o == GOF(FPR31) && sz == 8) return o;
+
+   /* For the various byte sized XER/CR pieces, use offset 8
+      in VR0 .. VR31. */
+   tl_assert(SZB(VR0) == 16);
+   if (o == GOF(XER_SO) && sz == 1) return 8 +GOF(VR0);
+   if (o == GOF(XER_OV) && sz == 1) return 8 +GOF(VR1);
+   if (o == GOF(XER_CA) && sz == 1) return 8 +GOF(VR2);
+   if (o == GOF(XER_BC) && sz == 1) return 8 +GOF(VR3);
+
+   if (o == GOF(CR0_321) && sz == 1) return 8 +GOF(VR4);
+   if (o == GOF(CR0_0)   && sz == 1) return 8 +GOF(VR5);
+   if (o == GOF(CR1_321) && sz == 1) return 8 +GOF(VR6);
+   if (o == GOF(CR1_0)   && sz == 1) return 8 +GOF(VR7);
+   if (o == GOF(CR2_321) && sz == 1) return 8 +GOF(VR8);
+   if (o == GOF(CR2_0)   && sz == 1) return 8 +GOF(VR9);
+   if (o == GOF(CR3_321) && sz == 1) return 8 +GOF(VR10);
+   if (o == GOF(CR3_0)   && sz == 1) return 8 +GOF(VR11);
+   if (o == GOF(CR4_321) && sz == 1) return 8 +GOF(VR12);
+   if (o == GOF(CR4_0)   && sz == 1) return 8 +GOF(VR13);
+   if (o == GOF(CR5_321) && sz == 1) return 8 +GOF(VR14);
+   if (o == GOF(CR5_0)   && sz == 1) return 8 +GOF(VR15);
+   if (o == GOF(CR6_321) && sz == 1) return 8 +GOF(VR16);
+   if (o == GOF(CR6_0)   && sz == 1) return 8 +GOF(VR17);
+   if (o == GOF(CR7_321) && sz == 1) return 8 +GOF(VR18);
+   if (o == GOF(CR7_0)   && sz == 1) return 8 +GOF(VR19);
+
+   /* Vector registers .. use offset 0 in VR0 .. VR31. */
+   if (o >= GOF(VR0)  && o+sz <= GOF(VR0) +SZB(VR0))  return 0+ GOF(VR0);
+   if (o >= GOF(VR1)  && o+sz <= GOF(VR1) +SZB(VR1))  return 0+ GOF(VR1);
+   if (o >= GOF(VR2)  && o+sz <= GOF(VR2) +SZB(VR2))  return 0+ GOF(VR2);
+   if (o >= GOF(VR3)  && o+sz <= GOF(VR3) +SZB(VR3))  return 0+ GOF(VR3);
+   if (o >= GOF(VR4)  && o+sz <= GOF(VR4) +SZB(VR4))  return 0+ GOF(VR4);
+   if (o >= GOF(VR5)  && o+sz <= GOF(VR5) +SZB(VR5))  return 0+ GOF(VR5);
+   if (o >= GOF(VR6)  && o+sz <= GOF(VR6) +SZB(VR6))  return 0+ GOF(VR6);
+   if (o >= GOF(VR7)  && o+sz <= GOF(VR7) +SZB(VR7))  return 0+ GOF(VR7);
+   if (o >= GOF(VR8)  && o+sz <= GOF(VR8) +SZB(VR8))  return 0+ GOF(VR8);
+   if (o >= GOF(VR9)  && o+sz <= GOF(VR9) +SZB(VR9))  return 0+ GOF(VR9);
+   if (o >= GOF(VR10) && o+sz <= GOF(VR10)+SZB(VR10)) return 0+ GOF(VR10);
+   if (o >= GOF(VR11) && o+sz <= GOF(VR11)+SZB(VR11)) return 0+ GOF(VR11);
+   if (o >= GOF(VR12) && o+sz <= GOF(VR12)+SZB(VR12)) return 0+ GOF(VR12);
+   if (o >= GOF(VR13) && o+sz <= GOF(VR13)+SZB(VR13)) return 0+ GOF(VR13);
+   if (o >= GOF(VR14) && o+sz <= GOF(VR14)+SZB(VR14)) return 0+ GOF(VR14);
+   if (o >= GOF(VR15) && o+sz <= GOF(VR15)+SZB(VR15)) return 0+ GOF(VR15);
+   if (o >= GOF(VR16) && o+sz <= GOF(VR16)+SZB(VR16)) return 0+ GOF(VR16);
+   if (o >= GOF(VR17) && o+sz <= GOF(VR17)+SZB(VR17)) return 0+ GOF(VR17);
+   if (o >= GOF(VR18) && o+sz <= GOF(VR18)+SZB(VR18)) return 0+ GOF(VR18);
+   if (o >= GOF(VR19) && o+sz <= GOF(VR19)+SZB(VR19)) return 0+ GOF(VR19);
+   if (o >= GOF(VR20) && o+sz <= GOF(VR20)+SZB(VR20)) return 0+ GOF(VR20);
+   if (o >= GOF(VR21) && o+sz <= GOF(VR21)+SZB(VR21)) return 0+ GOF(VR21);
+   if (o >= GOF(VR22) && o+sz <= GOF(VR22)+SZB(VR22)) return 0+ GOF(VR22);
+   if (o >= GOF(VR23) && o+sz <= GOF(VR23)+SZB(VR23)) return 0+ GOF(VR23);
+   if (o >= GOF(VR24) && o+sz <= GOF(VR24)+SZB(VR24)) return 0+ GOF(VR24);
+   if (o >= GOF(VR25) && o+sz <= GOF(VR25)+SZB(VR25)) return 0+ GOF(VR25);
+   if (o >= GOF(VR26) && o+sz <= GOF(VR26)+SZB(VR26)) return 0+ GOF(VR26);
+   if (o >= GOF(VR27) && o+sz <= GOF(VR27)+SZB(VR27)) return 0+ GOF(VR27);
+   if (o >= GOF(VR28) && o+sz <= GOF(VR28)+SZB(VR28)) return 0+ GOF(VR28);
+   if (o >= GOF(VR29) && o+sz <= GOF(VR29)+SZB(VR29)) return 0+ GOF(VR29);
+   if (o >= GOF(VR30) && o+sz <= GOF(VR30)+SZB(VR30)) return 0+ GOF(VR30);
+   if (o >= GOF(VR31) && o+sz <= GOF(VR31)+SZB(VR31)) return 0+ GOF(VR31);
+
+   VG_(printf)("MC_(get_otrack_shadow_offset)(ppc32)(off=%d,sz=%d)\n",
+               offset,szB);
+   tl_assert(0);
+#  undef GOF
+#  undef SZB
+
+   /* -------------------- amd64 -------------------- */
+
+#  elif defined(VGA_amd64)
+
+#  define GOF(_fieldname) \
+      (offsetof(VexGuestAMD64State,guest_##_fieldname))
+#  define SZB(_fieldname) \
+      (sizeof(((VexGuestAMD64State*)0)->guest_##_fieldname))
+   Int  o      = offset;
+   Int  sz     = szB;
+   Bool is1248 = sz == 8 || sz == 4 || sz == 2 || sz == 1;
+   tl_assert(sz > 0);
+   tl_assert(host_is_little_endian());
+
+   if (o == GOF(RAX) && is1248) return o;
+   if (o == GOF(RCX) && is1248) return o;
+   if (o == GOF(RDX) && is1248) return o;
+   if (o == GOF(RBX) && is1248) return o;
+   if (o == GOF(RSP) && is1248) return o;
+   if (o == GOF(RBP) && is1248) return o;
+   if (o == GOF(RSI) && is1248) return o;
+   if (o == GOF(RDI) && is1248) return o;
+   if (o == GOF(R8)  && is1248) return o;
+   if (o == GOF(R9)  && is1248) return o;
+   if (o == GOF(R10) && is1248) return o;
+   if (o == GOF(R11) && is1248) return o;
+   if (o == GOF(R12) && is1248) return o;
+   if (o == GOF(R13) && is1248) return o;
+   if (o == GOF(R14) && is1248) return o;
+   if (o == GOF(R15) && is1248) return o;
+
+   if (o == GOF(CC_DEP1) && sz == 8) return o;
+   if (o == GOF(CC_DEP2) && sz == 8) return o;
+
+   if (o == GOF(CC_OP)   && sz == 8) return -1; /* slot used for %AH */
+   if (o == GOF(CC_NDEP) && sz == 8) return -1; /* slot used for %BH */
+   if (o == GOF(DFLAG)   && sz == 8) return -1; /* slot used for %CH */
+   if (o == GOF(RIP)     && sz == 8) return -1; /* slot unused */
+   if (o == GOF(IDFLAG)  && sz == 8) return -1; /* slot used for %DH */
+   if (o == GOF(FS_ZERO) && sz == 8) return -1; /* slot unused */
+
+   /* Treat %AH, %BH, %CH, %DH as independent registers.  To do this
+      requires finding 4 unused 32-bit slots in the second-shadow
+      guest state, respectively: CC_OP CC_NDEP DFLAG IDFLAG, since
+      none of those are tracked. */
+   tl_assert(SZB(CC_OP)   == 8);
+   tl_assert(SZB(CC_NDEP) == 8);
+   tl_assert(SZB(IDFLAG)  == 8);
+   tl_assert(SZB(DFLAG)   == 8);
+
+   if (o == 1+ GOF(RAX) && szB == 1) return GOF(CC_OP);
+   if (o == 1+ GOF(RBX) && szB == 1) return GOF(CC_NDEP);
+   if (o == 1+ GOF(RCX) && szB == 1) return GOF(DFLAG);
+   if (o == 1+ GOF(RDX) && szB == 1) return GOF(IDFLAG);
+
+   /* skip XMM and FP admin stuff */
+   if (o == GOF(SSEROUND) && szB == 8) return -1;
+   if (o == GOF(FTOP)     && szB == 4) return -1;
+   if (o == GOF(FPROUND)  && szB == 8) return -1;
+   if (o == GOF(EMWARN)   && szB == 4) return -1;
+   /* The amd64 front end doesn't actually use FC3210.  It should
+      be done away with.
+      if (offset == offsetof(VexGuestAMD64State,guest_FC3210) && szB==4)
+        return -1;
+   */
+
+   /* XMM registers */
+   if (o >= GOF(XMM0)  && o+sz <= GOF(XMM0) +SZB(XMM0))  return GOF(XMM0);
+   if (o >= GOF(XMM1)  && o+sz <= GOF(XMM1) +SZB(XMM1))  return GOF(XMM1);
+   if (o >= GOF(XMM2)  && o+sz <= GOF(XMM2) +SZB(XMM2))  return GOF(XMM2);
+   if (o >= GOF(XMM3)  && o+sz <= GOF(XMM3) +SZB(XMM3))  return GOF(XMM3);
+   if (o >= GOF(XMM4)  && o+sz <= GOF(XMM4) +SZB(XMM4))  return GOF(XMM4);
+   if (o >= GOF(XMM5)  && o+sz <= GOF(XMM5) +SZB(XMM5))  return GOF(XMM5);
+   if (o >= GOF(XMM6)  && o+sz <= GOF(XMM6) +SZB(XMM6))  return GOF(XMM6);
+   if (o >= GOF(XMM7)  && o+sz <= GOF(XMM7) +SZB(XMM7))  return GOF(XMM7);
+   if (o >= GOF(XMM8)  && o+sz <= GOF(XMM8) +SZB(XMM8))  return GOF(XMM8);
+   if (o >= GOF(XMM9)  && o+sz <= GOF(XMM9) +SZB(XMM9))  return GOF(XMM9);
+   if (o >= GOF(XMM10) && o+sz <= GOF(XMM10)+SZB(XMM10)) return GOF(XMM10);
+   if (o >= GOF(XMM11) && o+sz <= GOF(XMM11)+SZB(XMM11)) return GOF(XMM11);
+   if (o >= GOF(XMM12) && o+sz <= GOF(XMM12)+SZB(XMM12)) return GOF(XMM12);
+   if (o >= GOF(XMM13) && o+sz <= GOF(XMM13)+SZB(XMM13)) return GOF(XMM13);
+   if (o >= GOF(XMM14) && o+sz <= GOF(XMM14)+SZB(XMM14)) return GOF(XMM14);
+   if (o >= GOF(XMM15) && o+sz <= GOF(XMM15)+SZB(XMM15)) return GOF(XMM15);
+
+   /* MMX accesses to FP regs */
+   if (o == GOF(FPREG[0]) && sz == 8) return o;
+   if (o == GOF(FPREG[1]) && sz == 8) return o;
+   if (o == GOF(FPREG[2]) && sz == 8) return o;
+   if (o == GOF(FPREG[3]) && sz == 8) return o;
+   if (o == GOF(FPREG[4]) && sz == 8) return o;
+   if (o == GOF(FPREG[5]) && sz == 8) return o;
+   if (o == GOF(FPREG[6]) && sz == 8) return o;
+   if (o == GOF(FPREG[7]) && sz == 8) return o;
+
+   /* Map high halves of %RAX,%RCX,%RDX,%RBX to the whole register.
+      This is needed because the general handling of dirty helper
+      calls is done in 4 byte chunks.  Hence we will see these.
+      Currently we only expect to see artefacts from CPUID. */
+   if (o == 4+ GOF(RAX) && sz == 4) return GOF(RAX);
+   if (o == 4+ GOF(RCX) && sz == 4) return GOF(RCX);
+   if (o == 4+ GOF(RDX) && sz == 4) return GOF(RDX);
+   if (o == 4+ GOF(RBX) && sz == 4) return GOF(RBX);
+
+   VG_(printf)("MC_(get_otrack_shadow_offset)(amd64)(off=%d,sz=%d)\n",
+               offset,szB);
+   tl_assert(0);
+#  undef GOF
+#  undef SZB
+
+   /* --------------------- x86 --------------------- */
+
+#  elif defined(VGA_x86)
+
+#  define GOF(_fieldname) \
+      (offsetof(VexGuestX86State,guest_##_fieldname))
+#  define SZB(_fieldname) \
+      (sizeof(((VexGuestX86State*)0)->guest_##_fieldname))
+
+   Int  o     = offset;
+   Int  sz    = szB;
+   Bool is124 = sz == 4 || sz == 2 || sz == 1;
+   tl_assert(sz > 0);
+   tl_assert(host_is_little_endian());
+
+   if (o == GOF(EAX) && is124) return o;
+   if (o == GOF(ECX) && is124) return o;
+   if (o == GOF(EDX) && is124) return o;
+   if (o == GOF(EBX) && is124) return o;
+   if (o == GOF(ESP) && is124) return o;
+   if (o == GOF(EBP) && is124) return o;
+   if (o == GOF(ESI) && is124) return o;
+   if (o == GOF(EDI) && is124) return o;
+
+   if (o == GOF(CC_DEP1) && sz == 4) return o;
+   if (o == GOF(CC_DEP2) && sz == 4) return o;
+
+   if (o == GOF(CC_OP)   && sz == 4) return -1; /* slot used for %AH */
+   if (o == GOF(CC_NDEP) && sz == 4) return -1; /* slot used for %BH */
+   if (o == GOF(DFLAG)   && sz == 4) return -1; /* slot used for %CH */
+   if (o == GOF(EIP)     && sz == 4) return -1; /* slot unused */
+   if (o == GOF(IDFLAG)  && sz == 4) return -1; /* slot used for %DH */
+   if (o == GOF(ACFLAG)  && sz == 4) return -1; /* slot unused */
+
+   /* Treat %AH, %BH, %CH, %DH as independent registers.  To do this
+      requires finding 4 unused 32-bit slots in the second-shadow
+      guest state, respectively: CC_OP CC_NDEP DFLAG IDFLAG since none
+      of those are tracked. */
+   tl_assert(SZB(CC_OP)   == 4);
+   tl_assert(SZB(CC_NDEP) == 4);
+   tl_assert(SZB(DFLAG)   == 4);
+   tl_assert(SZB(IDFLAG)  == 4);
+   if (o == 1+ GOF(EAX) && szB == 1) return GOF(CC_OP);
+   if (o == 1+ GOF(EBX) && szB == 1) return GOF(CC_NDEP);
+   if (o == 1+ GOF(ECX) && szB == 1) return GOF(DFLAG);
+   if (o == 1+ GOF(EDX) && szB == 1) return GOF(IDFLAG);
+
+   /* skip XMM and FP admin stuff */
+   if (o == GOF(SSEROUND) && szB == 4) return -1;
+   if (o == GOF(FTOP)     && szB == 4) return -1;
+   if (o == GOF(FPROUND)  && szB == 4) return -1;
+   if (o == GOF(EMWARN)   && szB == 4) return -1;
+   if (o == GOF(FC3210)   && szB == 4) return -1;
+
+   /* XMM registers */
+   if (o >= GOF(XMM0)  && o+sz <= GOF(XMM0)+SZB(XMM0)) return GOF(XMM0);
+   if (o >= GOF(XMM1)  && o+sz <= GOF(XMM1)+SZB(XMM1)) return GOF(XMM1);
+   if (o >= GOF(XMM2)  && o+sz <= GOF(XMM2)+SZB(XMM2)) return GOF(XMM2);
+   if (o >= GOF(XMM3)  && o+sz <= GOF(XMM3)+SZB(XMM3)) return GOF(XMM3);
+   if (o >= GOF(XMM4)  && o+sz <= GOF(XMM4)+SZB(XMM4)) return GOF(XMM4);
+   if (o >= GOF(XMM5)  && o+sz <= GOF(XMM5)+SZB(XMM5)) return GOF(XMM5);
+   if (o >= GOF(XMM6)  && o+sz <= GOF(XMM6)+SZB(XMM6)) return GOF(XMM6);
+   if (o >= GOF(XMM7)  && o+sz <= GOF(XMM7)+SZB(XMM7)) return GOF(XMM7);
+
+   /* MMX accesses to FP regs.  Need to allow for 32-bit references
+      due to dirty helpers for frstor etc, which reference the entire
+      64-byte block in one go. */
+   if (o >= GOF(FPREG[0])
+       && o+sz <= GOF(FPREG[0])+SZB(FPREG[0])) return GOF(FPREG[0]);
+   if (o >= GOF(FPREG[1])
+       && o+sz <= GOF(FPREG[1])+SZB(FPREG[1])) return GOF(FPREG[1]);
+   if (o >= GOF(FPREG[2])
+       && o+sz <= GOF(FPREG[2])+SZB(FPREG[2])) return GOF(FPREG[2]);
+   if (o >= GOF(FPREG[3])
+       && o+sz <= GOF(FPREG[3])+SZB(FPREG[3])) return GOF(FPREG[3]);
+   if (o >= GOF(FPREG[4])
+       && o+sz <= GOF(FPREG[4])+SZB(FPREG[4])) return GOF(FPREG[4]);
+   if (o >= GOF(FPREG[5])
+       && o+sz <= GOF(FPREG[5])+SZB(FPREG[5])) return GOF(FPREG[5]);
+   if (o >= GOF(FPREG[6])
+       && o+sz <= GOF(FPREG[6])+SZB(FPREG[6])) return GOF(FPREG[6]);
+   if (o >= GOF(FPREG[7])
+       && o+sz <= GOF(FPREG[7])+SZB(FPREG[7])) return GOF(FPREG[7]);
+
+   /* skip %GS and other segment related stuff.  We could shadow
+      guest_LDT and guest_GDT, although it seems pointless.
+      guest_CS .. guest_SS are too small to shadow directly and it
+      also seems pointless to shadow them indirectly (that is, in 
+      the style of %AH .. %DH). */
+   if (o == GOF(CS) && sz == 2) return -1;
+   if (o == GOF(DS) && sz == 2) return -1;
+   if (o == GOF(ES) && sz == 2) return -1;
+   if (o == GOF(FS) && sz == 2) return -1;
+   if (o == GOF(GS) && sz == 2) return -1;
+   if (o == GOF(SS) && sz == 2) return -1;
+   if (o == GOF(LDT) && sz == 4) return -1;
+   if (o == GOF(GDT) && sz == 4) return -1;
+
+   VG_(printf)("MC_(get_otrack_shadow_offset)(x86)(off=%d,sz=%d)\n",
+               offset,szB);
+   tl_assert(0);
+#  undef GOF
+#  undef SZB
+
+#  else
+#    error "FIXME: not implemented for this architecture"
+#  endif
+}
+
+
+/* Let 'arr' describe an indexed reference to a guest state section
+   (guest state array).
+
+   This function returns the corresponding guest state type to be used
+   when indexing the corresponding array in the second shadow (origin
+   tracking) area.  If the array is not to be origin-tracked, return
+   Ity_INVALID.
+
+   This function must agree with MC_(get_otrack_shadow_offset) above.
+   See comments at the start of MC_(get_otrack_shadow_offset).
+*/
+IRType MC_(get_otrack_reg_array_equiv_int_type) ( IRRegArray* arr )
+{
+   /* -------------------- ppc64 -------------------- */
+#  if defined(VGA_ppc64)
+   /* The redir stack. */
+   if (arr->base == offsetof(VexGuestPPC64State,guest_REDIR_STACK[0])
+       && arr->elemTy == Ity_I64
+       && arr->nElems == VEX_GUEST_PPC64_REDIR_STACK_SIZE)
+      return Ity_I64;
+
+   VG_(printf)("get_reg_array_equiv_int_type(ppc64): unhandled: ");
+   ppIRRegArray(arr);
+   VG_(printf)("\n");
+   tl_assert(0);
+
+   /* -------------------- ppc32 -------------------- */
+#  elif defined(VGA_ppc32)
+   /* The redir stack. */
+   if (arr->base == offsetof(VexGuestPPC32State,guest_REDIR_STACK[0])
+       && arr->elemTy == Ity_I32
+       && arr->nElems == VEX_GUEST_PPC32_REDIR_STACK_SIZE)
+      return Ity_I32;
+
+   VG_(printf)("get_reg_array_equiv_int_type(ppc32): unhandled: ");
+   ppIRRegArray(arr);
+   VG_(printf)("\n");
+   tl_assert(0);
+
+   /* -------------------- amd64 -------------------- */
+#  elif defined(VGA_amd64)
+   /* Ignore the FP tag array - pointless to shadow, and in any case
+      the elements are too small */
+   if (arr->base == offsetof(VexGuestAMD64State,guest_FPTAG)
+       && arr->elemTy == Ity_I8 && arr->nElems == 8)
+      return Ity_INVALID;
+
+   /* The FP register array */
+   if (arr->base == offsetof(VexGuestAMD64State,guest_FPREG[0])
+       && arr->elemTy == Ity_F64 && arr->nElems == 8)
+      return Ity_I64;
+
+   VG_(printf)("get_reg_array_equiv_int_type(amd64): unhandled: ");
+   ppIRRegArray(arr);
+   VG_(printf)("\n");
+   tl_assert(0);
+
+   /* --------------------- x86 --------------------- */
+#  elif defined(VGA_x86)
+   /* Ignore the FP tag array - pointless to shadow, and in any case
+      the elements are too small */
+   if (arr->base == offsetof(VexGuestX86State,guest_FPTAG)
+       && arr->elemTy == Ity_I8 && arr->nElems == 8)
+      return Ity_INVALID;
+
+   /* The FP register array */
+   if (arr->base == offsetof(VexGuestX86State,guest_FPREG[0])
+       && arr->elemTy == Ity_F64 && arr->nElems == 8)
+      return Ity_I64;
+
+   VG_(printf)("get_reg_array_equiv_int_type(x86): unhandled: ");
+   ppIRRegArray(arr);
+   VG_(printf)("\n");
+   tl_assert(0);
+
+#  else
+#    error "FIXME: not implemented for this architecture"
+#  endif
+}
+
+
+/*--------------------------------------------------------------------*/
+/*--- end                                             mc_machine.c ---*/
+/*--------------------------------------------------------------------*/
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c
index d00bf03..e4b7e13 100644
--- a/memcheck/mc_main.c
+++ b/memcheck/mc_main.c
@@ -55,6 +55,9 @@
 
 #define DEBUG(fmt, args...) //VG_(printf)(fmt, ## args)
 
+static void ocache_sarp_Set_Origins ( Addr, UWord, UInt ); /* fwds */
+static void ocache_sarp_Clear_Origins ( Addr, UWord ); /* fwds */
+
 
 /*------------------------------------------------------------*/
 /*--- Fast-case knobs                                      ---*/
@@ -70,6 +73,11 @@
 #define PERF_FAST_STACK    1
 #define PERF_FAST_STACK2   1
 
+/* Change this to 1 to enable assertions on origin tracking cache fast
+   paths */
+#define OC_ENABLE_ASSERTIONS 0
+
+
 /*------------------------------------------------------------*/
 /*--- V bits and A bits                                    ---*/
 /*------------------------------------------------------------*/
@@ -1133,9 +1141,9 @@
 static void mc_record_address_error  ( ThreadId tid, Addr a,
                                        Int size, Bool isWrite );
 static void mc_record_core_mem_error ( ThreadId tid, Bool isAddrErr, Char* s );
-static void mc_record_regparam_error ( ThreadId tid, Char* msg );
+static void mc_record_regparam_error ( ThreadId tid, Char* msg, UInt otag );
 static void mc_record_memparam_error ( ThreadId tid, Addr a,
-                                       Bool isAddrErr, Char* msg );
+                                       Bool isAddrErr, Char* msg, UInt otag );
 static void mc_record_jump_error     ( ThreadId tid, Addr a );
 
 static
@@ -1554,20 +1562,55 @@
    PROF_EVENT(40, "MC_(make_mem_noaccess)");
    DEBUG("MC_(make_mem_noaccess)(%p, %lu)\n", a, len);
    set_address_range_perms ( a, len, VA_BITS16_NOACCESS, SM_DIST_NOACCESS );
+   if (UNLIKELY( MC_(clo_mc_level) == 3 ))
+      ocache_sarp_Clear_Origins ( a, len );
 }
 
-void MC_(make_mem_undefined) ( Addr a, SizeT len )
+static void make_mem_undefined ( Addr a, SizeT len )
+{
+   PROF_EVENT(41, "make_mem_undefined");
+   DEBUG("make_mem_undefined(%p, %lu)\n", a, len);
+   set_address_range_perms ( a, len, VA_BITS16_UNDEFINED, SM_DIST_UNDEFINED );
+}
+
+void MC_(make_mem_undefined_w_otag) ( Addr a, SizeT len, UInt otag )
 {
    PROF_EVENT(41, "MC_(make_mem_undefined)");
    DEBUG("MC_(make_mem_undefined)(%p, %lu)\n", a, len);
    set_address_range_perms ( a, len, VA_BITS16_UNDEFINED, SM_DIST_UNDEFINED );
+   if (UNLIKELY( MC_(clo_mc_level) == 3 ))
+      ocache_sarp_Set_Origins ( a, len, otag );
 }
 
+static
+void make_mem_undefined_w_tid_and_okind ( Addr a, SizeT len,
+                                          ThreadId tid, UInt okind )
+{
+   UInt        ecu;
+   ExeContext* here;
+   /* VG_(record_ExeContext) checks for validity of tid, and asserts
+      if it is invalid.  So no need to do it here. */
+   tl_assert(okind <= 3);
+   here = VG_(record_ExeContext)( tid, 0/*first_ip_delta*/ );
+   tl_assert(here);
+   ecu = VG_(get_ECU_from_ExeContext)(here);
+   tl_assert(VG_(is_plausible_ECU)(ecu));
+   MC_(make_mem_undefined_w_otag) ( a, len, ecu | okind );
+}
+
+static
+void make_mem_undefined_w_tid ( Addr a, SizeT len, ThreadId tid ) {
+   make_mem_undefined_w_tid_and_okind ( a, len, tid, MC_OKIND_UNKNOWN );
+}
+
+
 void MC_(make_mem_defined) ( Addr a, SizeT len )
 {
    PROF_EVENT(42, "MC_(make_mem_defined)");
    DEBUG("MC_(make_mem_defined)(%p, %lu)\n", a, len);
    set_address_range_perms ( a, len, VA_BITS16_DEFINED, SM_DIST_DEFINED );
+   if (UNLIKELY( MC_(clo_mc_level) == 3 ))
+      ocache_sarp_Clear_Origins ( a, len );
 }
 
 /* For each byte in [a,a+len), if the byte is addressable, make it be
@@ -1583,6 +1626,9 @@
       vabits2 = get_vabits2( a+i );
       if (LIKELY(VA_BITS2_NOACCESS != vabits2)) {
          set_vabits2(a+i, VA_BITS2_DEFINED);
+         if (UNLIKELY(MC_(clo_mc_level) >= 3)) {
+            MC_(helperc_b_store1)( a+i, 0 ); /* clear the origin tag */
+         } 
       }
    }
 }
@@ -1672,10 +1718,436 @@
 }
 
 
-/* --- Fast case permission setters, for dealing with stacks. --- */
+/*------------------------------------------------------------*/
+/*--- Origin tracking stuff - cache basics                 ---*/
+/*------------------------------------------------------------*/
 
-static INLINE
-void make_aligned_word32_undefined ( Addr a )
+/* Some background comments on the origin tracking implementation
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Note that this implementation draws inspiration from the "origin
+   tracking by value piggybacking" scheme described in "Tracking Bad
+   Apples: Reporting the Origin of Null and Undefined Value Errors"
+   (Michael Bond, Nicholas Nethercote, Stephen Kent, Samuel Guyer,
+   Kathryn McKinley, OOPSLA07, Montreal, Oct 2007) but in fact it is
+   implemented completely differently.
+
+   This implementation tracks the defining point of all values using
+   so called "origin tags", which are 32-bit integers, rather than
+   using the values themselves to encode the origins.  The latter,
+   so-called "value piggybacking", is what the OOPSLA07 paper
+   describes.
+
+   Origin tags, as tracked by the machinery below, are 32-bit unsigned
+   ints (UInts), regardless of the machine's word size.
+
+   > Question: why is otag a UInt?  Wouldn't a UWord be better?  Isn't
+   > it really just the address of the relevant ExeContext?
+
+   Well, it's not the address, but a value which has a 1-1 mapping
+   with ExeContexts, and is guaranteed not to be zero, since zero
+   denotes (to memcheck) "unknown origin or defined value".  So these
+   UInts are just numbers starting at 1; each ExeContext is given a
+   number when it is created.
+
+   Making these otags 32-bit regardless of the machine's word size
+   makes the 64-bit implementation easier (next para).  And it doesn't
+   really limit us in any way, since for the tags to overflow would
+   require that the program somehow caused 2^32-1 different
+   ExeContexts to be created, in which case it is probably in deep
+   trouble.  Not to mention V will have soaked up many tens of
+   gigabytes of memory merely to store them all.
+
+   So having 64-bit origins doesn't really buy you anything, and has
+   the following downsides:
+
+   Suppose that instead, an otag is a UWord.  This would mean that, on
+   a 64-bit target,
+
+   1. It becomes hard to shadow any element of guest state which is
+      smaller than 8 bytes.  To do so means you'd need to find some
+      8-byte-sized hole in the guest state which you don't want to
+      shadow, and use that instead to hold the otag.  On ppc64, the
+      condition code register(s) are split into 20 UChar sized pieces,
+      all of which need to be tracked (guest_XER_SO .. guest_CR7_0)
+      and so that would entail finding 160 bytes somewhere else in the
+      guest state.
+
+      Even on x86, I want to track origins for %AH .. %DH (bits 15:8
+      of %EAX .. %EDX) that are separate from %AL .. %DL (bits 7:0 of
+      same) and so I had to look for 4 untracked otag-sized areas in
+      the guest state to make that possible.
+
+      The same problem exists of course when origin tags are only 32
+      bits, but it's less extreme.
+
+   2. (More compelling) it doubles the size of the origin shadow
+      memory.  Given that the shadow memory is organised as a fixed
+      size cache, and that accuracy of tracking is limited by origins
+      falling out the cache due to space conflicts, this isn't good.
+
+   > Another question: is the origin tracking perfect, or are there
+   > cases where it fails to determine an origin?
+
+   It is imperfect for at least the following reasons, and
+   probably more:
+
+   * Insufficient capacity in the origin cache.  When a line is
+     evicted from the cache it is gone forever, and so subsequent
+     queries for the line produce zero, indicating no origin
+     information.  Interestingly, a line containing all zeroes can be
+     evicted "free" from the cache, since it contains no useful
+     information, so there is scope perhaps for some cleverer cache
+     management schemes.
+
+   * The origin cache only stores one otag per 32-bits of address
+     space, plus 4 bits indicating which of the 4 bytes has that tag
+     and which are considered defined.  The result is that if two
+     undefined bytes in the same word are stored in memory, the first
+     stored byte's origin will be lost and replaced by the origin for
+     the second byte.
+
+   * Nonzero origin tags for defined values.  Consider a binary
+     operator application op(x,y).  Suppose y is undefined (and so has
+     a valid nonzero origin tag), and x is defined, but erroneously
+     has a nonzero origin tag (defined values should have tag zero).
+     If the erroneous tag has a numeric value greater than y's tag,
+     then the rule for propagating origin tags though binary
+     operations, which is simply to take the unsigned max of the two
+     tags, will erroneously propagate x's tag rather than y's.
+
+   * Some obscure uses of x86/amd64 byte registers can cause lossage
+     or confusion of origins.  %AH .. %DH are treated as different
+     from, and unrelated to, their parent registers, %EAX .. %EDX.
+     So some weird sequences like
+
+        movb undefined-value, %AH
+        movb defined-value, %AL
+        .. use %AX or %EAX ..
+
+     will cause the origin attributed to %AH to be ignored, since %AL,
+     %AX, %EAX are treated as the same register, and %AH as a
+     completely separate one.
+
+   But having said all that, it actually seems to work fairly well in
+   practice.
+*/
+
+static UWord stats_ocacheL1_find           = 0;
+static UWord stats_ocacheL1_found_at_1     = 0;
+static UWord stats_ocacheL1_found_at_N     = 0;
+static UWord stats_ocacheL1_misses         = 0;
+static UWord stats_ocacheL1_lossage        = 0;
+static UWord stats_ocacheL1_movefwds       = 0;
+
+static UWord stats__ocacheL2_refs          = 0;
+static UWord stats__ocacheL2_misses        = 0;
+static UWord stats__ocacheL2_n_nodes_max   = 0;
+
+/* Cache of 32-bit values, one every 32 bits of address space */
+
+#define OC_BITS_PER_LINE 5
+#define OC_W32S_PER_LINE (1 << (OC_BITS_PER_LINE - 2))
+
+static INLINE UWord oc_line_offset ( Addr a ) {
+   return (a >> 2) & (OC_W32S_PER_LINE - 1);
+}
+static INLINE Bool is_valid_oc_tag ( Addr tag ) {
+   return 0 == (tag & ((1 << OC_BITS_PER_LINE) - 1));
+}
+
+#define OC_LINES_PER_SET 2
+
+#define OC_N_SET_BITS    20
+#define OC_N_SETS        (1 << OC_N_SET_BITS)
+
+/* These settings give:
+   64 bit host: ocache:  100,663,296 sizeB    67,108,864 useful
+   32 bit host: ocache:   92,274,688 sizeB    67,108,864 useful
+*/
+
+#define OC_MOVE_FORWARDS_EVERY_BITS 7
+
+
+typedef
+   struct {
+      Addr  tag;
+      UInt  w32[OC_W32S_PER_LINE];
+      UChar descr[OC_W32S_PER_LINE];
+   }
+   OCacheLine;
+
+/* Classify and also sanity-check 'line'.  Return 'e' (empty) if not
+   in use, 'n' (nonzero) if it contains at least one valid origin tag,
+   and 'z' if all the represented tags are zero. */
+static UChar classify_OCacheLine ( OCacheLine* line )
+{
+   UWord i;
+   if (line->tag == 1/*invalid*/)
+      return 'e'; /* EMPTY */
+   tl_assert(is_valid_oc_tag(line->tag));
+   for (i = 0; i < OC_W32S_PER_LINE; i++) {
+      tl_assert(0 == ((~0xF) & line->descr[i]));
+      if (line->w32[i] > 0 && line->descr[i] > 0)
+         return 'n'; /* NONZERO - contains useful info */
+   }
+   return 'z'; /* ZERO - no useful info */
+}
+
+typedef
+   struct {
+      OCacheLine line[OC_LINES_PER_SET];
+   }
+   OCacheSet;
+
+typedef
+   struct {
+      OCacheSet set[OC_N_SETS];
+   }
+   OCache;
+
+static OCache ocache;
+static UWord  ocache_event_ctr = 0;
+
+static void init_ocacheL2 ( void ); /* fwds */
+static void init_OCache ( void )
+{
+   UWord line, set;
+   for (set = 0; set < OC_N_SETS; set++) {
+      for (line = 0; line < OC_LINES_PER_SET; line++) {
+         ocache.set[set].line[line].tag = 1/*invalid*/;
+      }
+   }
+   init_ocacheL2();
+}
+
+static void moveLineForwards ( OCacheSet* set, UWord lineno )
+{
+   OCacheLine tmp;
+   stats_ocacheL1_movefwds++;
+   tl_assert(lineno > 0 && lineno < OC_LINES_PER_SET);
+   tmp = set->line[lineno-1];
+   set->line[lineno-1] = set->line[lineno];
+   set->line[lineno] = tmp;
+}
+
+static void zeroise_OCacheLine ( OCacheLine* line, Addr tag ) {
+   UWord i;
+   for (i = 0; i < OC_W32S_PER_LINE; i++) {
+      line->w32[i] = 0; /* NO ORIGIN */
+      line->descr[i] = 0; /* REALLY REALLY NO ORIGIN! */
+   }
+   line->tag = tag;
+}
+
+//////////////////////////////////////////////////////////////
+//// OCache backing store
+
+static OSet* ocacheL2 = NULL;
+
+static void* ocacheL2_malloc ( SizeT szB ) {
+   return VG_(malloc)(szB);
+}
+static void ocacheL2_free ( void* v ) {
+   VG_(free)( v );
+}
+
+/* Stats: # nodes currently in tree */
+static UWord stats__ocacheL2_n_nodes = 0;
+
+static void init_ocacheL2 ( void )
+{
+   tl_assert(!ocacheL2);
+   tl_assert(sizeof(Word) == sizeof(Addr)); /* since OCacheLine.tag :: Addr */
+   tl_assert(0 == offsetof(OCacheLine,tag));
+   ocacheL2 
+      = VG_(OSetGen_Create)( offsetof(OCacheLine,tag), 
+                             NULL, /* fast cmp */
+                             ocacheL2_malloc, ocacheL2_free );
+   tl_assert(ocacheL2);
+   stats__ocacheL2_n_nodes = 0;
+}
+
+/* Find line with the given tag in the tree, or NULL if not found. */
+static OCacheLine* ocacheL2_find_tag ( Addr tag )
+{
+   OCacheLine* line;
+   tl_assert(is_valid_oc_tag(tag));
+   stats__ocacheL2_refs++;
+   line = VG_(OSetGen_Lookup)( ocacheL2, &tag );
+   return line;
+}
+
+/* Delete the line with the given tag from the tree, if it is present, and
+   free up the associated memory. */
+static void ocacheL2_del_tag ( Addr tag )
+{
+   OCacheLine* line;
+   tl_assert(is_valid_oc_tag(tag));
+   stats__ocacheL2_refs++;
+   line = VG_(OSetGen_Remove)( ocacheL2, &tag );
+   if (line) {
+      VG_(OSetGen_FreeNode)(ocacheL2, line);
+      tl_assert(stats__ocacheL2_n_nodes > 0);
+      stats__ocacheL2_n_nodes--;
+   }
+}
+
+/* Add a copy of the given line to the tree.  It must not already be
+   present. */
+static void ocacheL2_add_line ( OCacheLine* line )
+{
+   OCacheLine* copy;
+   tl_assert(is_valid_oc_tag(line->tag));
+   copy = VG_(OSetGen_AllocNode)( ocacheL2, sizeof(OCacheLine) );
+   tl_assert(copy);
+   *copy = *line;
+   stats__ocacheL2_refs++;
+   VG_(OSetGen_Insert)( ocacheL2, copy );
+   stats__ocacheL2_n_nodes++;
+   if (stats__ocacheL2_n_nodes > stats__ocacheL2_n_nodes_max)
+      stats__ocacheL2_n_nodes_max = stats__ocacheL2_n_nodes;
+}
+
+////
+//////////////////////////////////////////////////////////////
+
+__attribute__((noinline))
+static OCacheLine* find_OCacheLine_SLOW ( Addr a )
+{
+   OCacheLine *victim, *inL2;
+   UChar c;
+   UWord line;
+   UWord setno   = (a >> OC_BITS_PER_LINE) & (OC_N_SETS - 1);
+   UWord tagmask = ~((1 << OC_BITS_PER_LINE) - 1);
+   UWord tag     = a & tagmask;
+   tl_assert(setno >= 0 && setno < OC_N_SETS);
+
+   /* we already tried line == 0; skip therefore. */
+   for (line = 1; line < OC_LINES_PER_SET; line++) {
+      if (ocache.set[setno].line[line].tag == tag) {
+         if (line == 1) {
+            stats_ocacheL1_found_at_1++;
+         } else {
+            stats_ocacheL1_found_at_N++;
+         }
+         if (UNLIKELY(0 == (ocache_event_ctr++ 
+                            & ((1<<OC_MOVE_FORWARDS_EVERY_BITS)-1)))) {
+            moveLineForwards( &ocache.set[setno], line );
+            line--;
+         }
+         return &ocache.set[setno].line[line];
+      }
+   }
+
+   /* A miss.  Use the last slot.  Implicitly this means we're
+      ejecting the line in the last slot. */
+   stats_ocacheL1_misses++;
+   tl_assert(line == OC_LINES_PER_SET);
+   line--;
+   tl_assert(line > 0);
+
+   /* First, move the to-be-ejected line to the L2 cache. */
+   victim = &ocache.set[setno].line[line];
+   c = classify_OCacheLine(victim);
+   switch (c) {
+      case 'e':
+         /* the line is empty (has invalid tag); ignore it. */
+         break;
+      case 'z':
+         /* line contains zeroes.  We must ensure the backing store is
+            updated accordingly, either by copying the line there
+            verbatim, or by ensuring it isn't present there.  We
+            choose the latter on the basis that it reduces the size of
+            the backing store. */
+         ocacheL2_del_tag( victim->tag );
+         break;
+      case 'n':
+         /* line contains at least one real, useful origin.  Copy it
+            to the backing store. */
+         stats_ocacheL1_lossage++;
+         inL2 = ocacheL2_find_tag( victim->tag );
+         if (inL2) {
+            *inL2 = *victim;
+         } else {
+            ocacheL2_add_line( victim );
+         }
+         break;
+      default:
+         tl_assert(0);
+   }
+
+   /* Now we must reload the L1 cache from the backing tree, if
+      possible. */
+   tl_assert(tag != victim->tag); /* stay sane */
+   inL2 = ocacheL2_find_tag( tag );
+   if (inL2) {
+      /* We're in luck.  It's in the L2. */
+      ocache.set[setno].line[line] = *inL2;
+   } else {
+      /* Missed at both levels of the cache hierarchy.  We have to
+         declare it as full of zeroes (unknown origins). */
+      stats__ocacheL2_misses++;
+      zeroise_OCacheLine( &ocache.set[setno].line[line], tag );
+   }
+
+   /* Move it one forwards */
+   moveLineForwards( &ocache.set[setno], line );
+   line--;
+
+   return &ocache.set[setno].line[line];
+}
+
+static INLINE OCacheLine* find_OCacheLine ( Addr a )
+{
+   UWord setno   = (a >> OC_BITS_PER_LINE) & (OC_N_SETS - 1);
+   UWord tagmask = ~((1 << OC_BITS_PER_LINE) - 1);
+   UWord tag     = a & tagmask;
+
+   stats_ocacheL1_find++;
+
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(setno >= 0 && setno < OC_N_SETS);
+      tl_assert(0 == (tag & (4 * OC_W32S_PER_LINE - 1)));
+   }
+
+   if (LIKELY(ocache.set[setno].line[0].tag == tag)) {
+      return &ocache.set[setno].line[0];
+   }
+
+   return find_OCacheLine_SLOW( a );
+}
+
+static INLINE void set_aligned_word64_Origin_to_undef ( Addr a, UInt otag )
+{
+   //// BEGIN inlined, specialised version of MC_(helperc_b_store8)
+   //// Set the origins for a+0 .. a+7
+   { OCacheLine* line;
+     UWord lineoff = oc_line_offset(a);
+     if (OC_ENABLE_ASSERTIONS) {
+        tl_assert(lineoff >= 0 
+                  && lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/);
+     }
+     line = find_OCacheLine( a );
+     line->descr[lineoff+0] = 0xF;
+     line->descr[lineoff+1] = 0xF;
+     line->w32[lineoff+0]   = otag;
+     line->w32[lineoff+1]   = otag;
+   }
+   //// END inlined, specialised version of MC_(helperc_b_store8)
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Aligned fast case permission setters,                ---*/
+/*--- for dealing with stacks                              ---*/
+/*------------------------------------------------------------*/
+
+/*--------------------- 32-bit ---------------------*/
+
+/* Nb: by "aligned" here we mean 4-byte aligned */
+
+static INLINE void make_aligned_word32_undefined ( Addr a )
 {
    UWord   sm_off;
    SecMap* sm;
@@ -1683,11 +2155,11 @@
    PROF_EVENT(300, "make_aligned_word32_undefined");
 
 #ifndef PERF_FAST_STACK2
-   MC_(make_mem_undefined)(a, 4);
+   make_mem_undefined(a, 4);
 #else
    if (UNLIKELY(a > MAX_PRIMARY_ADDRESS)) {
       PROF_EVENT(301, "make_aligned_word32_undefined-slow1");
-      MC_(make_mem_undefined)(a, 4);
+      make_mem_undefined(a, 4);
       return;
    }
 
@@ -1697,6 +2169,23 @@
 #endif
 }
 
+static INLINE
+void make_aligned_word32_undefined_w_otag ( Addr a, UInt otag )
+{
+   make_aligned_word32_undefined(a);
+   //// BEGIN inlined, specialised version of MC_(helperc_b_store4)
+   //// Set the origins for a+0 .. a+3
+   { OCacheLine* line;
+     UWord lineoff = oc_line_offset(a);
+     if (OC_ENABLE_ASSERTIONS) {
+        tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+     }
+     line = find_OCacheLine( a );
+     line->descr[lineoff] = 0xF;
+     line->w32[lineoff]   = otag;
+   }
+   //// END inlined, specialised version of MC_(helperc_b_store4)
+}
 
 static INLINE
 void make_aligned_word32_noaccess ( Addr a )
@@ -1718,13 +2207,27 @@
    sm                  = get_secmap_for_writing_low(a);
    sm_off              = SM_OFF(a);
    sm->vabits8[sm_off] = VA_BITS8_NOACCESS;
+
+   //// BEGIN inlined, specialised version of MC_(helperc_b_store4)
+   //// Set the origins for a+0 .. a+3.
+   if (UNLIKELY( MC_(clo_mc_level) == 3 )) {
+      OCacheLine* line;
+      UWord lineoff = oc_line_offset(a);
+      if (OC_ENABLE_ASSERTIONS) {
+         tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+      }
+      line = find_OCacheLine( a );
+      line->descr[lineoff] = 0;
+   }
+   //// END inlined, specialised version of MC_(helperc_b_store4)
 #endif
 }
 
+/*--------------------- 64-bit ---------------------*/
 
 /* Nb: by "aligned" here we mean 8-byte aligned */
-static INLINE
-void make_aligned_word64_undefined ( Addr a )
+
+static INLINE void make_aligned_word64_undefined ( Addr a )
 {
    UWord   sm_off16;
    SecMap* sm;
@@ -1732,11 +2235,11 @@
    PROF_EVENT(320, "make_aligned_word64_undefined");
 
 #ifndef PERF_FAST_STACK2
-   MC_(make_mem_undefined)(a, 8);
+   make_mem_undefined(a, 8);
 #else
    if (UNLIKELY(a > MAX_PRIMARY_ADDRESS)) {
       PROF_EVENT(321, "make_aligned_word64_undefined-slow1");
-      MC_(make_mem_undefined)(a, 8);
+      make_mem_undefined(a, 8);
       return;
    }
 
@@ -1746,6 +2249,24 @@
 #endif
 }
 
+static INLINE
+void make_aligned_word64_undefined_w_otag ( Addr a, UInt otag )
+{
+   make_aligned_word64_undefined(a);
+   //// BEGIN inlined, specialised version of MC_(helperc_b_store8)
+   //// Set the origins for a+0 .. a+7
+   { OCacheLine* line;
+     UWord lineoff = oc_line_offset(a);
+     tl_assert(lineoff >= 0 
+               && lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/);
+     line = find_OCacheLine( a );
+     line->descr[lineoff+0] = 0xF;
+     line->descr[lineoff+1] = 0xF;
+     line->w32[lineoff+0]   = otag;
+     line->w32[lineoff+1]   = otag;
+   }
+   //// END inlined, specialised version of MC_(helperc_b_store8)
+}
 
 static INLINE
 void make_aligned_word64_noaccess ( Addr a )
@@ -1767,6 +2288,19 @@
    sm       = get_secmap_for_writing_low(a);
    sm_off16 = SM_OFF_16(a);
    ((UShort*)(sm->vabits8))[sm_off16] = VA_BITS16_NOACCESS;
+
+   //// BEGIN inlined, specialised version of MC_(helperc_b_store8)
+   //// Clear the origins for a+0 .. a+7.
+   if (UNLIKELY( MC_(clo_mc_level) == 3 )) {
+      OCacheLine* line;
+      UWord lineoff = oc_line_offset(a);
+      tl_assert(lineoff >= 0 
+                && lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/);
+      line = find_OCacheLine( a );
+      line->descr[lineoff+0] = 0;
+      line->descr[lineoff+1] = 0;
+   }
+   //// END inlined, specialised version of MC_(helperc_b_store8)
 #endif
 }
 
@@ -1775,13 +2309,26 @@
 /*--- Stack pointer adjustment                             ---*/
 /*------------------------------------------------------------*/
 
+/*--------------- adjustment by 4 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_4_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(110, "new_mem_stack_4");
+   if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 4, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_4(Addr new_SP)
 {
    PROF_EVENT(110, "new_mem_stack_4");
    if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
       make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 4 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 4 );
    }
 }
 
@@ -1795,16 +2342,32 @@
    }
 }
 
+/*--------------- adjustment by 8 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_8_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(111, "new_mem_stack_8");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP, otag );
+   } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP  , otag );
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+4, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 8, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_8(Addr new_SP)
 {
    PROF_EVENT(111, "new_mem_stack_8");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
    } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
-      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP   );
+      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
       make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP+4 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 8 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 8 );
    }
 }
 
@@ -1821,20 +2384,40 @@
    }
 }
 
+/*--------------- adjustment by 12 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_12_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(112, "new_mem_stack_12");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP  , otag );
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+8, otag );
+   } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      /* from previous test we don't have 8-alignment at offset +0,
+         hence must have 8 alignment at offsets +4/-4.  Hence safe to
+         do 4 at +0 and then 8 at +4/. */
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP  , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+4, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 12, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_12(Addr new_SP)
 {
    PROF_EVENT(112, "new_mem_stack_12");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP   );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
       make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8 );
    } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
       /* from previous test we don't have 8-alignment at offset +0,
          hence must have 8 alignment at offsets +4/-4.  Hence safe to
          do 4 at +0 and then 8 at +4/. */
-      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP   );
+      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+4 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 12 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 12 );
    }
 }
 
@@ -1858,21 +2441,42 @@
    }
 }
 
+/*--------------- adjustment by 16 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_16_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(113, "new_mem_stack_16");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      /* Have 8-alignment at +0, hence do 8 at +0 and 8 at +8. */
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP  , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+8, otag );
+   } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      /* Have 4 alignment at +0 but not 8; hence 8 must be at +4.
+         Hence do 4 at +0, 8 at +4, 4 at +12. */
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP   , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+4 , otag );
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+12, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 16, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_16(Addr new_SP)
 {
    PROF_EVENT(113, "new_mem_stack_16");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
       /* Have 8-alignment at +0, hence do 8 at +0 and 8 at +8. */
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP   );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8 );
    } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
       /* Have 4 alignment at +0 but not 8; hence 8 must be at +4.
          Hence do 4 at +0, 8 at +4, 4 at +12. */
-      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP    );
+      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+4  );
       make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP+12 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 16 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 16 );
    }
 }
 
@@ -1893,25 +2497,50 @@
    }
 }
 
+/*--------------- adjustment by 32 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_32_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(114, "new_mem_stack_32");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      /* Straightforward */
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP   , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+8 , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+16, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+24, otag );
+   } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      /* 8 alignment must be at +4.  Hence do 8 at +4,+12,+20 and 4 at
+         +0,+28. */
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP   , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+4 , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+12, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+20, otag );
+      make_aligned_word32_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+28, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 32, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_32(Addr new_SP)
 {
    PROF_EVENT(114, "new_mem_stack_32");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
       /* Straightforward */
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP    );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8  );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+16 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+24 );
    } else if (VG_IS_4_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
       /* 8 alignment must be at +4.  Hence do 8 at +4,+12,+20 and 4 at
          +0,+28. */
-      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP    );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+4  );
+      make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+4 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+12 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+20 );
       make_aligned_word32_undefined ( -VG_STACK_REDZONE_SZB + new_SP+28 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 32 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 32 );
    }
 }
 
@@ -1937,12 +2566,38 @@
    }
 }
 
+/*--------------- adjustment by 112 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_112_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(115, "new_mem_stack_112");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP   , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+8 , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+16, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+24, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+32, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+40, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+48, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+56, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+64, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+72, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+80, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+88, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+96, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+104, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 112, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_112(Addr new_SP)
 {
    PROF_EVENT(115, "new_mem_stack_112");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP    );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8  );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+16 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+24 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+32 );
@@ -1954,9 +2609,9 @@
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+80 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+88 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+96 );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104);
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 112 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 112 );
    }
 }
 
@@ -1983,12 +2638,40 @@
    }
 }
 
+/*--------------- adjustment by 128 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_128_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(116, "new_mem_stack_128");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP   , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+8 , otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+16, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+24, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+32, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+40, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+48, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+56, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+64, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+72, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+80, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+88, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+96, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+104, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+112, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+120, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 128, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_128(Addr new_SP)
 {
    PROF_EVENT(116, "new_mem_stack_128");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP    );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8  );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+16 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+24 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+32 );
@@ -2000,11 +2683,11 @@
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+80 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+88 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+96 );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+112);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+120);
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+112 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+120 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 128 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 128 );
    }
 }
 
@@ -2033,12 +2716,42 @@
    }
 }
 
+/*--------------- adjustment by 144 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_144_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(117, "new_mem_stack_144");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP,     otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+8,   otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+16,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+24,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+32,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+40,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+48,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+56,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+64,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+72,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+80,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+88,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+96,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+104, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+112, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+120, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+128, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+136, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 144, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_144(Addr new_SP)
 {
    PROF_EVENT(117, "new_mem_stack_144");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP    );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8  );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+16 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+24 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+32 );
@@ -2050,13 +2763,13 @@
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+80 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+88 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+96 );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+112);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+120);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+128);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+136);
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+112 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+120 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+128 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+136 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 144 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 144 );
    }
 }
 
@@ -2087,12 +2800,44 @@
    }
 }
 
+/*--------------- adjustment by 160 bytes ---------------*/
+
+static void VG_REGPARM(2) mc_new_mem_stack_160_w_ECU(Addr new_SP, UInt ecu)
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(118, "new_mem_stack_160");
+   if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP,     otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+8,   otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+16,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+24,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+32,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+40,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+48,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+56,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+64,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+72,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+80,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+88,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+96,  otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+104, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+112, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+120, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+128, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+136, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+144, otag );
+      make_aligned_word64_undefined_w_otag ( -VG_STACK_REDZONE_SZB + new_SP+152, otag );
+   } else {
+      MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + new_SP, 160, otag );
+   }
+}
+
 static void VG_REGPARM(1) mc_new_mem_stack_160(Addr new_SP)
 {
    PROF_EVENT(118, "new_mem_stack_160");
    if (VG_IS_8_ALIGNED( -VG_STACK_REDZONE_SZB + new_SP )) {
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP    );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8  );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+8 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+16 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+24 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+32 );
@@ -2104,15 +2849,15 @@
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+80 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+88 );
       make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+96 );
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+112);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+120);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+128);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+136);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+144);
-      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+152);
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+104 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+112 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+120 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+128 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+136 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+144 );
+      make_aligned_word64_undefined ( -VG_STACK_REDZONE_SZB + new_SP+152 );
    } else {
-      MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + new_SP, 160 );
+      make_mem_undefined ( -VG_STACK_REDZONE_SZB + new_SP, 160 );
    }
 }
 
@@ -2145,10 +2890,19 @@
    }
 }
 
+/*--------------- adjustment by N bytes ---------------*/
+
+static void mc_new_mem_stack_w_ECU ( Addr a, SizeT len, UInt ecu )
+{
+   UInt otag = ecu | MC_OKIND_STACK;
+   PROF_EVENT(115, "new_mem_stack_w_otag");
+   MC_(make_mem_undefined_w_otag) ( -VG_STACK_REDZONE_SZB + a, len, otag );
+}
+
 static void mc_new_mem_stack ( Addr a, SizeT len )
 {
    PROF_EVENT(115, "new_mem_stack");
-   MC_(make_mem_undefined) ( -VG_STACK_REDZONE_SZB + a, len );
+   make_mem_undefined ( -VG_STACK_REDZONE_SZB + a, len );
 }
 
 static void mc_die_mem_stack ( Addr a, SizeT len )
@@ -2186,42 +2940,133 @@
    with defined values and g could mistakenly read them.  So the RZ
    also needs to be nuked on function calls.
 */
-void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len )
+
+
+/* Here's a simple cache to hold nia -> ECU mappings.  It could be
+   improved so as to have a lower miss rate. */
+
+static UWord stats__nia_cache_queries = 0;
+static UWord stats__nia_cache_misses  = 0;
+
+typedef
+   struct { UWord nia0; UWord ecu0;   /* nia0 maps to ecu0 */
+            UWord nia1; UWord ecu1; } /* nia1 maps to ecu1 */
+   WCacheEnt;
+
+#define N_NIA_TO_ECU_CACHE 511
+
+static WCacheEnt nia_to_ecu_cache[N_NIA_TO_ECU_CACHE];
+
+static void init_nia_to_ecu_cache ( void )
 {
+   UWord       i;
+   Addr        zero_addr = 0;
+   ExeContext* zero_ec;
+   UInt        zero_ecu;
+   /* Fill all the slots with an entry for address zero, and the
+      relevant otags accordingly.  Hence the cache is initially filled
+      with valid data. */
+   zero_ec = VG_(make_depth_1_ExeContext_from_Addr)(zero_addr);
+   tl_assert(zero_ec);
+   zero_ecu = VG_(get_ECU_from_ExeContext)(zero_ec);
+   tl_assert(VG_(is_plausible_ECU)(zero_ecu));
+   for (i = 0; i < N_NIA_TO_ECU_CACHE; i++) {
+      nia_to_ecu_cache[i].nia0 = zero_addr;
+      nia_to_ecu_cache[i].ecu0 = zero_ecu;
+      nia_to_ecu_cache[i].nia1 = zero_addr;
+      nia_to_ecu_cache[i].ecu1 = zero_ecu;
+   }
+}
+
+static inline UInt convert_nia_to_ecu ( Addr nia )
+{
+   UWord i;
+   UInt        ecu;
+   ExeContext* ec;
+
+   tl_assert( sizeof(nia_to_ecu_cache[0].nia1) == sizeof(nia) );
+
+   stats__nia_cache_queries++;
+   i = nia % N_NIA_TO_ECU_CACHE;
+   tl_assert(i >= 0 && i < N_NIA_TO_ECU_CACHE);
+
+   if (LIKELY( nia_to_ecu_cache[i].nia0 == nia ))
+      return nia_to_ecu_cache[i].ecu0;
+
+   if (LIKELY( nia_to_ecu_cache[i].nia1 == nia )) {
+#     define SWAP(_w1,_w2) { UWord _t = _w1; _w1 = _w2; _w2 = _t; }
+      SWAP( nia_to_ecu_cache[i].nia0, nia_to_ecu_cache[i].nia1 );
+      SWAP( nia_to_ecu_cache[i].ecu0, nia_to_ecu_cache[i].ecu1 );
+#     undef SWAP
+      return nia_to_ecu_cache[i].ecu0;
+   }
+
+   stats__nia_cache_misses++;
+   ec = VG_(make_depth_1_ExeContext_from_Addr)(nia);
+   tl_assert(ec);
+   ecu = VG_(get_ECU_from_ExeContext)(ec);
+   tl_assert(VG_(is_plausible_ECU)(ecu));
+
+   nia_to_ecu_cache[i].nia1 = nia_to_ecu_cache[i].nia0;
+   nia_to_ecu_cache[i].ecu1 = nia_to_ecu_cache[i].ecu0;
+
+   nia_to_ecu_cache[i].nia0 = nia;
+   nia_to_ecu_cache[i].ecu0 = (UWord)ecu;
+   return ecu;
+}
+
+
+/* Note that this serves both the origin-tracking and
+   no-origin-tracking modes.  We assume that calls to it are
+   sufficiently infrequent that it isn't worth specialising for the
+   with/without origin-tracking cases. */
+void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len, Addr nia )
+{
+   UInt otag;
    tl_assert(sizeof(UWord) == sizeof(SizeT));
    if (0)
-      VG_(printf)("helperc_MAKE_STACK_UNINIT %p %lu\n", base, len );
+      VG_(printf)("helperc_MAKE_STACK_UNINIT (%p,%lu,nia=%p)\n",
+                  base, len, nia );
+
+   if (UNLIKELY( MC_(clo_mc_level) == 3 )) {
+      UInt ecu = convert_nia_to_ecu ( nia );
+      tl_assert(VG_(is_plausible_ECU)(ecu));
+      otag = ecu | MC_OKIND_STACK;
+   } else {
+      tl_assert(nia == 0);
+      otag = 0;
+   }
 
 #  if 0
    /* Really slow version */
-   MC_(make_mem_undefined)(base, len);
+   MC_(make_mem_undefined)(base, len, otag);
 #  endif
 
 #  if 0
    /* Slow(ish) version, which is fairly easily seen to be correct.
    */
    if (LIKELY( VG_IS_8_ALIGNED(base) && len==128 )) {
-      make_aligned_word64_undefined(base +   0);
-      make_aligned_word64_undefined(base +   8);
-      make_aligned_word64_undefined(base +  16);
-      make_aligned_word64_undefined(base +  24);
+      make_aligned_word64_undefined(base +   0, otag);
+      make_aligned_word64_undefined(base +   8, otag);
+      make_aligned_word64_undefined(base +  16, otag);
+      make_aligned_word64_undefined(base +  24, otag);
 
-      make_aligned_word64_undefined(base +  32);
-      make_aligned_word64_undefined(base +  40);
-      make_aligned_word64_undefined(base +  48);
-      make_aligned_word64_undefined(base +  56);
+      make_aligned_word64_undefined(base +  32, otag);
+      make_aligned_word64_undefined(base +  40, otag);
+      make_aligned_word64_undefined(base +  48, otag);
+      make_aligned_word64_undefined(base +  56, otag);
 
-      make_aligned_word64_undefined(base +  64);
-      make_aligned_word64_undefined(base +  72);
-      make_aligned_word64_undefined(base +  80);
-      make_aligned_word64_undefined(base +  88);
+      make_aligned_word64_undefined(base +  64, otag);
+      make_aligned_word64_undefined(base +  72, otag);
+      make_aligned_word64_undefined(base +  80, otag);
+      make_aligned_word64_undefined(base +  88, otag);
 
-      make_aligned_word64_undefined(base +  96);
-      make_aligned_word64_undefined(base + 104);
-      make_aligned_word64_undefined(base + 112);
-      make_aligned_word64_undefined(base + 120);
+      make_aligned_word64_undefined(base +  96, otag);
+      make_aligned_word64_undefined(base + 104, otag);
+      make_aligned_word64_undefined(base + 112, otag);
+      make_aligned_word64_undefined(base + 120, otag);
    } else {
-      MC_(make_mem_undefined)(base, len);
+      MC_(make_mem_undefined)(base, len, otag);
    }
 #  endif 
 
@@ -2233,6 +3078,7 @@
       directly into the vabits array.  (If the sm was distinguished, this
       will make a copy and then write to it.)
    */
+
    if (LIKELY( len == 128 && VG_IS_8_ALIGNED(base) )) {
       /* Now we know the address range is suitably sized and aligned. */
       UWord a_lo = (UWord)(base);
@@ -2265,6 +3111,24 @@
             p[13] = VA_BITS16_UNDEFINED;
             p[14] = VA_BITS16_UNDEFINED;
             p[15] = VA_BITS16_UNDEFINED;
+            if (UNLIKELY( MC_(clo_mc_level) == 3 )) {
+               set_aligned_word64_Origin_to_undef( base + 8 * 0, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 1, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 2, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 3, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 4, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 5, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 6, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 7, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 8, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 9, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 10, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 11, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 12, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 13, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 14, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 15, otag );
+            }
             return;
          }
       }
@@ -2323,13 +3187,51 @@
             p[33] = VA_BITS16_UNDEFINED;
             p[34] = VA_BITS16_UNDEFINED;
             p[35] = VA_BITS16_UNDEFINED;
+            if (UNLIKELY( MC_(clo_mc_level) == 3 )) {
+               set_aligned_word64_Origin_to_undef( base + 8 * 0, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 1, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 2, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 3, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 4, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 5, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 6, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 7, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 8, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 9, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 10, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 11, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 12, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 13, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 14, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 15, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 16, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 17, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 18, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 19, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 20, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 21, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 22, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 23, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 24, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 25, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 26, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 27, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 28, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 29, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 30, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 31, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 32, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 33, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 34, otag );
+               set_aligned_word64_Origin_to_undef( base + 8 * 35, otag );
+            }
             return;
          }
       }
    }
 
    /* else fall into slow case */
-   MC_(make_mem_undefined)(base, len);
+   MC_(make_mem_undefined_w_otag)(base, len, otag);
 }
 
 
@@ -2372,7 +3274,8 @@
    return True;
 }
 
-static Bool is_mem_addressable ( Addr a, SizeT len, Addr* bad_addr )
+static Bool is_mem_addressable ( Addr a, SizeT len, 
+                                 /*OUT*/Addr* bad_addr )
 {
    SizeT i;
    UWord vabits2;
@@ -2390,13 +3293,18 @@
    return True;
 }
 
-static MC_ReadResult is_mem_defined ( Addr a, SizeT len, Addr* bad_addr )
+static MC_ReadResult is_mem_defined ( Addr a, SizeT len,
+                                      /*OUT*/Addr* bad_addr,
+                                      /*OUT*/UInt* otag )
 {
    SizeT i;
    UWord vabits2;
 
    PROF_EVENT(64, "is_mem_defined");
    DEBUG("is_mem_defined\n");
+
+   if (otag)     *otag = 0;
+   if (bad_addr) *bad_addr = 0;
    for (i = 0; i < len; i++) {
       PROF_EVENT(65, "is_mem_defined(loop)");
       vabits2 = get_vabits2(a);
@@ -2404,9 +3312,18 @@
          // Error!  Nb: Report addressability errors in preference to
          // definedness errors.  And don't report definedeness errors unless
          // --undef-value-errors=yes.
-         if (bad_addr != NULL) *bad_addr = a;
-         if      ( VA_BITS2_NOACCESS == vabits2 ) return MC_AddrErr; 
-         else if ( MC_(clo_undef_value_errors)  ) return MC_ValueErr;
+         if (bad_addr) {
+            *bad_addr = a;
+         }
+         if (VA_BITS2_NOACCESS == vabits2) {
+            return MC_AddrErr;
+         }
+         if (MC_(clo_mc_level) >= 2) {
+            if (otag && MC_(clo_mc_level) == 3) {
+               *otag = MC_(helperc_b_load1)( a );
+            }
+            return MC_ValueErr;
+         }
       }
       a++;
    }
@@ -2418,12 +3335,15 @@
    examine the actual bytes, to find the end, until we're sure it is
    safe to do so. */
 
-static Bool mc_is_defined_asciiz ( Addr a, Addr* bad_addr )
+static Bool mc_is_defined_asciiz ( Addr a, Addr* bad_addr, UInt* otag )
 {
    UWord vabits2;
 
    PROF_EVENT(66, "mc_is_defined_asciiz");
    DEBUG("mc_is_defined_asciiz\n");
+
+   if (otag)     *otag = 0;
+   if (bad_addr) *bad_addr = 0;
    while (True) {
       PROF_EVENT(67, "mc_is_defined_asciiz(loop)");
       vabits2 = get_vabits2(a);
@@ -2431,9 +3351,18 @@
          // Error!  Nb: Report addressability errors in preference to
          // definedness errors.  And don't report definedeness errors unless
          // --undef-value-errors=yes.
-         if (bad_addr != NULL) *bad_addr = a;
-         if      ( VA_BITS2_NOACCESS == vabits2 ) return MC_AddrErr; 
-         else if ( MC_(clo_undef_value_errors)  ) return MC_ValueErr;
+         if (bad_addr) {
+            *bad_addr = a;
+         }
+         if (VA_BITS2_NOACCESS == vabits2) {
+            return MC_AddrErr;
+         }
+         if (MC_(clo_mc_level) >= 2) {
+            if (otag && MC_(clo_mc_level) == 3) {
+               *otag = MC_(helperc_b_load1)( a );
+            }
+            return MC_ValueErr;
+         }
       }
       /* Ok, a is safe to read. */
       if (* ((UChar*)a) == 0) {
@@ -2458,7 +3387,8 @@
    if (!ok) {
       switch (part) {
       case Vg_CoreSysCall:
-         mc_record_memparam_error ( tid, bad_addr, /*isAddrErr*/True, s );
+         mc_record_memparam_error ( tid, bad_addr, 
+                                    /*isAddrErr*/True, s, 0/*otag*/ );
          break;
 
       case Vg_CoreSignal:
@@ -2475,15 +3405,17 @@
 void check_mem_is_defined ( CorePart part, ThreadId tid, Char* s,
                             Addr base, SizeT size )
 {     
+   UInt otag = 0;
    Addr bad_addr;
-   MC_ReadResult res = is_mem_defined ( base, size, &bad_addr );
+   MC_ReadResult res = is_mem_defined ( base, size, &bad_addr, &otag );
 
    if (MC_Ok != res) {
       Bool isAddrErr = ( MC_AddrErr == res ? True : False );
 
       switch (part) {
       case Vg_CoreSysCall:
-         mc_record_memparam_error ( tid, bad_addr, isAddrErr, s );
+         mc_record_memparam_error ( tid, bad_addr, isAddrErr, s,
+                                    isAddrErr ? 0 : otag );
          break;
       
       /* If we're being asked to jump to a silly address, record an error 
@@ -2504,12 +3436,14 @@
 {
    MC_ReadResult res;
    Addr bad_addr = 0;   // shut GCC up
+   UInt otag = 0;
 
    tl_assert(part == Vg_CoreSysCall);
-   res = mc_is_defined_asciiz ( (Addr)str, &bad_addr );
+   res = mc_is_defined_asciiz ( (Addr)str, &bad_addr, &otag );
    if (MC_Ok != res) {
       Bool isAddrErr = ( MC_AddrErr == res ? True : False );
-      mc_record_memparam_error ( tid, bad_addr, isAddrErr, s );
+      mc_record_memparam_error ( tid, bad_addr, isAddrErr, s,
+                                 isAddrErr ? 0 : otag );
    }
 }
 
@@ -2549,6 +3483,30 @@
 /*--- Register event handlers                              ---*/
 /*------------------------------------------------------------*/
 
+/* Try and get a nonzero origin for the guest state section of thread
+   tid characterised by (offset,size).  Return 0 if nothing to show
+   for it. */
+static UInt mb_get_origin_for_guest_offset ( ThreadId tid,
+                                             Int offset, SizeT size )
+{
+   Int   sh2off;
+   UChar area[6];
+   UInt  otag;
+   sh2off = MC_(get_otrack_shadow_offset)( offset, size );
+   if (sh2off == -1)
+      return 0;  /* This piece of guest state is not tracked */
+   tl_assert(sh2off >= 0);
+   tl_assert(0 == (sh2off % 4));
+   area[0] = 0x31;
+   area[5] = 0x27;
+   VG_(get_shadow_regs_area)( tid, &area[1], 2/*shadowno*/,sh2off,4 );
+   tl_assert(area[0] == 0x31);
+   tl_assert(area[5] == 0x27);
+   otag = *(UInt*)&area[1];
+   return otag;
+}
+
+
 /* When some chunk of guest state is written, mark the corresponding
    shadow area as valid.  This is used to initialise arbitrarily large
    chunks of guest state, hence the _SIZE value, which has to be as
@@ -2561,7 +3519,7 @@
    UChar area[MAX_REG_WRITE_SIZE];
    tl_assert(size <= MAX_REG_WRITE_SIZE);
    VG_(memset)(area, V_BITS8_DEFINED, size);
-   VG_(set_shadow_regs_area)( tid, offset, size, area );
+   VG_(set_shadow_regs_area)( tid, 1/*shadowNo*/,offset,size, area );
 #  undef MAX_REG_WRITE_SIZE
 }
 
@@ -2582,11 +3540,12 @@
 {
    Int   i;
    Bool  bad;
+   UInt  otag;
 
    UChar area[16];
    tl_assert(size <= 16);
 
-   VG_(get_shadow_regs_area)( tid, offset, size, area );
+   VG_(get_shadow_regs_area)( tid, area, 1/*shadowNo*/,offset,size );
 
    bad = False;
    for (i = 0; i < size; i++) {
@@ -2596,8 +3555,13 @@
       }
    }
 
-   if (bad)
-      mc_record_regparam_error ( tid, s );
+   if (!bad)
+      return;
+
+   /* We've found some undefinedness.  See if we can also find an
+      origin for it. */
+   otag = mb_get_origin_for_guest_offset( tid, offset, size );
+   mc_record_regparam_error ( tid, s, otag );
 }
 
 
@@ -2605,6 +3569,13 @@
 /*--- Error types                                          ---*/
 /*------------------------------------------------------------*/
 
+/* Did we show the user any errors for which an uninitialised value
+   origin could have been collected (but wasn't)?  If so, then at the
+   end of the run print a one-line message advising that a rerun with
+   --track-origins=yes might help. */
+static Bool any_value_errors = False;
+
+
 // Different kinds of blocks.
 typedef enum {
    Block_Mallocd = 111,
@@ -2714,11 +3685,17 @@
       // - as a pointer in a load or store
       // - as a jump target
       struct {
-         SizeT szB;     // size of value in bytes
+         SizeT szB;   // size of value in bytes
+         // Origin info
+         UInt        otag;      // origin tag
+         ExeContext* origin_ec; // filled in later
       } Value;
 
       // Use of an undefined value in a conditional branch or move.
       struct {
+         // Origin info
+         UInt        otag;      // origin tag
+         ExeContext* origin_ec; // filled in later
       } Cond;
 
       // Addressability error in core (signal-handling) operation.
@@ -2742,18 +3719,27 @@
 
       // System call register input contains undefined bytes.
       struct {
+         // Origin info
+         UInt        otag;      // origin tag
+         ExeContext* origin_ec; // filled in later
       } RegParam;
 
       // System call memory input contains undefined/unaddressable bytes
       struct {
          Bool     isAddrErr;  // Addressability or definedness error?
          AddrInfo ai;
+         // Origin info
+         UInt        otag;      // origin tag
+         ExeContext* origin_ec; // filled in later
       } MemParam;
 
       // Problem found from a client request like CHECK_MEM_IS_ADDRESSABLE.
       struct {
          Bool     isAddrErr;  // Addressability or definedness error?
          AddrInfo ai;
+         // Origin info
+         UInt        otag;      // origin tag
+         ExeContext* origin_ec; // filled in later
       } User;
 
       // Program tried to free() something that's not a heap block (this
@@ -2927,6 +3913,33 @@
    VG_(pp_ExeContext)( VG_(get_error_where)(err) );
 }
 
+static void mc_pp_origin ( ExeContext* ec, UInt okind )
+{
+   HChar* src   = NULL;
+   HChar* xpre  = VG_(clo_xml) ? "  <what>" : " ";
+   HChar* xpost = VG_(clo_xml) ? "</what>"  : "";
+   tl_assert(ec);
+
+   switch (okind) {
+      case MC_OKIND_STACK:   src = " by a stack allocation"; break;
+      case MC_OKIND_HEAP:    src = " by a heap allocation"; break;
+      case MC_OKIND_USER:    src = " by a client request"; break;
+      case MC_OKIND_UNKNOWN: src = ""; break;
+   }
+   tl_assert(src); /* guards against invalid 'okind' */
+
+   if (VG_(clo_xml)) {
+      VG_(message)(Vg_UserMsg, "  <origin>");
+   }
+
+   VG_(message)(Vg_UserMsg, "%sUninitialised value was created%s%s",
+                            xpre, src, xpost);
+   VG_(pp_ExeContext)( ec );
+   if (VG_(clo_xml)) {
+      VG_(message)(Vg_UserMsg, "  </origin>");
+   }
+}
+
 static void mc_pp_Error ( Error* err )
 {
    MC_Error* extra = VG_(get_error_extra)(err);
@@ -2943,24 +3956,51 @@
       } 
       
       case Err_Value:
-         mc_pp_msg("UninitValue", err,
-                   "Use of uninitialised value of size %d",
-                   extra->Err.Value.szB);
+         any_value_errors = True;
+         if (1 || extra->Err.Value.otag == 0) {
+            mc_pp_msg("UninitValue", err,
+                      "Use of uninitialised value of size %d",
+                      extra->Err.Value.szB);
+         } else {
+            mc_pp_msg("UninitValue", err,
+                      "Use of uninitialised value of size %d (otag %u)",
+                      extra->Err.Value.szB, extra->Err.Value.otag);
+         }
+         if (extra->Err.Value.origin_ec)
+            mc_pp_origin( extra->Err.Value.origin_ec,
+                          extra->Err.Value.otag & 3 );
          break;
 
       case Err_Cond:
-         mc_pp_msg("UninitCondition", err,
-                   "Conditional jump or move depends"
-                   " on uninitialised value(s)");
+         any_value_errors = True;
+         if (1 || extra->Err.Cond.otag == 0) {
+            mc_pp_msg("UninitCondition", err,
+                      "Conditional jump or move depends"
+                      " on uninitialised value(s)");
+         } else {
+            mc_pp_msg("UninitCondition", err,
+                      "Conditional jump or move depends"
+                      " on uninitialised value(s) (otag %u)",
+                      extra->Err.Cond.otag);
+         }
+         if (extra->Err.Cond.origin_ec)
+            mc_pp_origin( extra->Err.Cond.origin_ec,
+                          extra->Err.Cond.otag & 3 );
          break;
 
       case Err_RegParam:
+         any_value_errors = True;
          mc_pp_msg("SyscallParam", err,
                    "Syscall param %s contains uninitialised byte(s)",
                    VG_(get_error_string)(err));
+         if (extra->Err.RegParam.origin_ec)
+            mc_pp_origin( extra->Err.RegParam.origin_ec,
+                          extra->Err.RegParam.otag & 3 );
          break;
 
       case Err_MemParam:
+         if (!extra->Err.MemParam.isAddrErr)
+            any_value_errors = True;
          mc_pp_msg("SyscallParam", err,
                    "Syscall param %s points to %s byte(s)",
                    VG_(get_error_string)(err),
@@ -2968,15 +4008,23 @@
                      ? "unaddressable" : "uninitialised" ));
          mc_pp_AddrInfo(VG_(get_error_address)(err),
                         &extra->Err.MemParam.ai, False);
+         if (extra->Err.MemParam.origin_ec && !extra->Err.MemParam.isAddrErr)
+            mc_pp_origin( extra->Err.MemParam.origin_ec,
+                          extra->Err.MemParam.otag & 3 );
          break;
 
       case Err_User:
+         if (!extra->Err.User.isAddrErr)
+            any_value_errors = True;
          mc_pp_msg("ClientCheck", err,
                    "%s byte(s) found during client check request", 
                    ( extra->Err.User.isAddrErr
                      ? "Unaddressable" : "Uninitialised" ));
          mc_pp_AddrInfo(VG_(get_error_address)(err), &extra->Err.User.ai,
                         False);
+         if (extra->Err.User.origin_ec && !extra->Err.User.isAddrErr)
+            mc_pp_origin( extra->Err.User.origin_ec,
+                          extra->Err.User.otag & 3 );
          break;
 
       case Err_Free:
@@ -3165,18 +4213,27 @@
    VG_(maybe_record_error)( tid, Err_Addr, a, /*s*/NULL, &extra );
 }
 
-static void mc_record_value_error ( ThreadId tid, Int szB )
+static void mc_record_value_error ( ThreadId tid, Int szB, UInt otag )
 {
    MC_Error extra;
-   tl_assert(MC_(clo_undef_value_errors));
-   extra.Err.Value.szB = szB;
+   tl_assert( MC_(clo_mc_level) >= 2 );
+   if (otag > 0)
+      tl_assert( MC_(clo_mc_level) == 3 );
+   extra.Err.Value.szB       = szB;
+   extra.Err.Value.otag      = otag;
+   extra.Err.Value.origin_ec = NULL;  /* Filled in later */
    VG_(maybe_record_error)( tid, Err_Value, /*addr*/0, /*s*/NULL, &extra );
 }
 
-static void mc_record_cond_error ( ThreadId tid )
+static void mc_record_cond_error ( ThreadId tid, UInt otag )
 {
-   tl_assert(MC_(clo_undef_value_errors));
-   VG_(maybe_record_error)( tid, Err_Cond, /*addr*/0, /*s*/NULL, /*extra*/NULL);
+   MC_Error extra;
+   tl_assert( MC_(clo_mc_level) >= 2 );
+   if (otag > 0)
+      tl_assert( MC_(clo_mc_level) == 3 );
+   extra.Err.Cond.otag      = otag;
+   extra.Err.Cond.origin_ec = NULL;  /* Filled in later */
+   VG_(maybe_record_error)( tid, Err_Cond, /*addr*/0, /*s*/NULL, &extra );
 }
 
 /* --- Called from non-generated code --- */
@@ -3188,21 +4245,32 @@
    VG_(maybe_record_error)( tid, Err_CoreMem, /*addr*/0, msg, /*extra*/NULL );
 }
 
-static void mc_record_regparam_error ( ThreadId tid, Char* msg )
+static void mc_record_regparam_error ( ThreadId tid, Char* msg, UInt otag )
 {
+   MC_Error extra;
    tl_assert(VG_INVALID_THREADID != tid);
-   VG_(maybe_record_error)( tid, Err_RegParam, /*addr*/0, msg, /*extra*/NULL );
+   if (otag > 0)
+      tl_assert( MC_(clo_mc_level) == 3 );
+   extra.Err.RegParam.otag      = otag;
+   extra.Err.RegParam.origin_ec = NULL;  /* Filled in later */
+   VG_(maybe_record_error)( tid, Err_RegParam, /*addr*/0, msg, &extra );
 }
 
 static void mc_record_memparam_error ( ThreadId tid, Addr a, 
-                                       Bool isAddrErr, Char* msg )
+                                       Bool isAddrErr, Char* msg, UInt otag )
 {
    MC_Error extra;
    tl_assert(VG_INVALID_THREADID != tid);
    if (!isAddrErr) 
-      tl_assert(MC_(clo_undef_value_errors));
+      tl_assert( MC_(clo_mc_level) >= 2 );
+   if (otag != 0) {
+      tl_assert( MC_(clo_mc_level) == 3 );
+      tl_assert( !isAddrErr );
+   }
    extra.Err.MemParam.isAddrErr = isAddrErr;
    extra.Err.MemParam.ai.tag    = Addr_Undescribed;
+   extra.Err.MemParam.otag      = otag;
+   extra.Err.MemParam.origin_ec = NULL;  /* Filled in later */
    VG_(maybe_record_error)( tid, Err_MemParam, a, msg, &extra );
 }
 
@@ -3271,13 +4339,22 @@
                        /*allow_GDB_attach*/False, /*count_error*/False );
 }
 
-static void mc_record_user_error ( ThreadId tid, Addr a, Bool isAddrErr )
+static void mc_record_user_error ( ThreadId tid, Addr a,
+                                   Bool isAddrErr, UInt otag )
 {
    MC_Error extra;
-
+   if (otag != 0) {
+      tl_assert(!isAddrErr);
+      tl_assert( MC_(clo_mc_level) == 3 );
+   }
+   if (!isAddrErr) {
+      tl_assert( MC_(clo_mc_level) >= 2 );
+   }
    tl_assert(VG_INVALID_THREADID != tid);
    extra.Err.User.isAddrErr = isAddrErr;
    extra.Err.User.ai.tag    = Addr_Undescribed;
+   extra.Err.User.otag      = otag;
+   extra.Err.User.origin_ec = NULL;  /* Filled in later */
    VG_(maybe_record_error)( tid, Err_User, a, /*s*/NULL, &extra );
 }
 
@@ -3469,6 +4546,18 @@
    return;
 }
 
+/* Fill in *origin_ec as specified by otag, or NULL it out if otag
+   does not refer to a known origin. */
+static void update_origin ( /*OUT*/ExeContext** origin_ec,
+                            UInt otag )
+{
+   UInt ecu = otag & ~3;
+   *origin_ec = NULL;
+   if (VG_(is_plausible_ECU)(ecu)) {
+      *origin_ec = VG_(get_ExeContext_from_ECU)( ecu );
+   }
+}
+
 /* Updates the copy with address info if necessary (but not for all errors). */
 static UInt mc_update_extra( Error* err )
 {
@@ -3478,16 +4567,31 @@
    // These ones don't have addresses associated with them, and so don't
    // need any updating.
    case Err_CoreMem:
-   case Err_Value:
-   case Err_Cond:
+   //case Err_Value:
+   //case Err_Cond:
    case Err_Overlap:
-   case Err_RegParam:
    // For Err_Leaks the returned size does not matter -- they are always
    // shown with VG_(unique_error)() so they 'extra' not copied.  But
    // we make it consistent with the others.
    case Err_Leak:
       return sizeof(MC_Error);
 
+   // For value errors, get the ExeContext corresponding to the
+   // origin tag.  Note that it is a kludge to assume that 
+   // a length-1 trace indicates a stack origin.  FIXME.
+   case Err_Value:
+      update_origin( &extra->Err.Value.origin_ec,
+                     extra->Err.Value.otag );
+      return sizeof(MC_Error);
+   case Err_Cond:
+      update_origin( &extra->Err.Cond.origin_ec,
+                     extra->Err.Cond.otag );
+      return sizeof(MC_Error);
+   case Err_RegParam:
+      update_origin( &extra->Err.RegParam.origin_ec,
+                     extra->Err.RegParam.otag );
+      return sizeof(MC_Error);
+
    // These ones always involve a memory address.
    case Err_Addr:
       describe_addr ( VG_(get_error_address)(err),
@@ -3496,6 +4600,8 @@
    case Err_MemParam:
       describe_addr ( VG_(get_error_address)(err),
                       &extra->Err.MemParam.ai );
+      update_origin( &extra->Err.MemParam.origin_ec,
+                     extra->Err.MemParam.otag );
       return sizeof(MC_Error);
    case Err_Jump:
       describe_addr ( VG_(get_error_address)(err),
@@ -3504,6 +4610,8 @@
    case Err_User:
       describe_addr ( VG_(get_error_address)(err),
                       &extra->Err.User.ai );
+      update_origin( &extra->Err.User.origin_ec,
+                     extra->Err.User.otag );
       return sizeof(MC_Error);
    case Err_Free:
       describe_addr ( VG_(get_error_address)(err),
@@ -4183,29 +5291,57 @@
 /*--- Value-check failure handlers.                        ---*/
 /*------------------------------------------------------------*/
 
-void MC_(helperc_value_check0_fail) ( void )
-{
-   mc_record_cond_error ( VG_(get_running_tid)() );
+/* Call these ones when an origin is available ... */
+VG_REGPARM(1)
+void MC_(helperc_value_check0_fail_w_o) ( UWord origin ) {
+   mc_record_cond_error ( VG_(get_running_tid)(), (UInt)origin );
 }
 
-void MC_(helperc_value_check1_fail) ( void )
-{
-   mc_record_value_error ( VG_(get_running_tid)(), 1 );
+VG_REGPARM(1)
+void MC_(helperc_value_check1_fail_w_o) ( UWord origin ) {
+   mc_record_value_error ( VG_(get_running_tid)(), 1, (UInt)origin );
 }
 
-void MC_(helperc_value_check4_fail) ( void )
-{
-   mc_record_value_error ( VG_(get_running_tid)(), 4 );
+VG_REGPARM(1)
+void MC_(helperc_value_check4_fail_w_o) ( UWord origin ) {
+   mc_record_value_error ( VG_(get_running_tid)(), 4, (UInt)origin );
 }
 
-void MC_(helperc_value_check8_fail) ( void )
-{
-   mc_record_value_error ( VG_(get_running_tid)(), 8 );
+VG_REGPARM(1)
+void MC_(helperc_value_check8_fail_w_o) ( UWord origin ) {
+   mc_record_value_error ( VG_(get_running_tid)(), 8, (UInt)origin );
 }
 
-VG_REGPARM(1) void MC_(helperc_complain_undef) ( HWord sz )
-{
-   mc_record_value_error ( VG_(get_running_tid)(), (Int)sz );
+VG_REGPARM(2) 
+void MC_(helperc_value_checkN_fail_w_o) ( HWord sz, UWord origin ) {
+   mc_record_value_error ( VG_(get_running_tid)(), (Int)sz, (UInt)origin );
+}
+
+/* ... and these when an origin isn't available. */
+
+VG_REGPARM(0)
+void MC_(helperc_value_check0_fail_no_o) ( void ) {
+   mc_record_cond_error ( VG_(get_running_tid)(), 0/*origin*/ );
+}
+
+VG_REGPARM(0)
+void MC_(helperc_value_check1_fail_no_o) ( void ) {
+   mc_record_value_error ( VG_(get_running_tid)(), 1, 0/*origin*/ );
+}
+
+VG_REGPARM(0)
+void MC_(helperc_value_check4_fail_no_o) ( void ) {
+   mc_record_value_error ( VG_(get_running_tid)(), 4, 0/*origin*/ );
+}
+
+VG_REGPARM(0)
+void MC_(helperc_value_check8_fail_no_o) ( void ) {
+   mc_record_value_error ( VG_(get_running_tid)(), 8, 0/*origin*/ );
+}
+
+VG_REGPARM(1) 
+void MC_(helperc_value_checkN_fail_no_o) ( HWord sz ) {
+   mc_record_value_error ( VG_(get_running_tid)(), (Int)sz, 0/*origin*/ );
 }
 
 
@@ -4297,7 +5433,7 @@
    } else {
       tl_assert(VG_IS_8_ALIGNED(a));
    }
-   if (is_mem_defined( a, sizeof(UWord), NULL ) == MC_Ok
+   if (is_mem_defined( a, sizeof(UWord), NULL, NULL) == MC_Ok
        && !in_ignored_range(a)) {
       return True;
    } else {
@@ -4367,9 +5503,12 @@
 
 static Bool mc_cheap_sanity_check ( void )
 {
-   /* nothing useful we can rapidly check */
    n_sanity_cheap++;
    PROF_EVENT(490, "cheap_sanity_check");
+   /* Check for sane operating level */
+   if (MC_(clo_mc_level) < 1 || MC_(clo_mc_level) > 3)
+      return False;
+   /* nothing else useful we can rapidly check */
    return True;
 }
 
@@ -4387,6 +5526,10 @@
    n_sanity_expensive++;
    PROF_EVENT(491, "expensive_sanity_check");
 
+   /* Check for sane operating level */
+   if (MC_(clo_mc_level) < 1 || MC_(clo_mc_level) > 3)
+      return False;
+
    /* Check that the 3 distinguished SMs are still as they should be. */
 
    /* Check noaccess DSM. */
@@ -4415,7 +5558,7 @@
 
    /* If we're not checking for undefined value errors, the secondary V bit
     * table should be empty. */
-   if (!MC_(clo_undef_value_errors)) {
+   if (MC_(clo_mc_level) == 1) {
       if (0 != VG_(OSetGen_Size)(secVBitTable))
          return False;
    }
@@ -4472,18 +5615,50 @@
 VgRes         MC_(clo_leak_resolution)        = Vg_LowRes;
 Bool          MC_(clo_show_reachable)         = False;
 Bool          MC_(clo_workaround_gcc296_bugs) = False;
-Bool          MC_(clo_undef_value_errors)     = True;
 Int           MC_(clo_malloc_fill)            = -1;
 Int           MC_(clo_free_fill)              = -1;
+Int           MC_(clo_mc_level)               = 2;
 
 static Bool mc_process_cmd_line_options(Char* arg)
 {
+   tl_assert( MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3 );
+
+   /* Set MC_(clo_mc_level):
+         1 = A bit tracking only
+         2 = A and V bit tracking, but no V bit origins
+         3 = A and V bit tracking, and V bit origins
+
+      Do this by inspecting --undef-value-errors= and
+      --track-origins=.  Reject the case --undef-value-errors=no
+      --track-origins=yes as meaningless.
+   */
+   if (0 == VG_(strcmp)(arg, "--undef-value-errors=no")) {
+      if (MC_(clo_mc_level) == 3)
+         goto mc_level_error;
+      MC_(clo_mc_level) = 1;
+      return True;
+   }
+   if (0 == VG_(strcmp)(arg, "--undef-value-errors=yes")) {
+      if (MC_(clo_mc_level) == 1)
+         MC_(clo_mc_level) = 2;
+      return True;
+   }
+   if (0 == VG_(strcmp)(arg, "--track-origins=no")) {
+      if (MC_(clo_mc_level) == 3)
+         MC_(clo_mc_level) = 2;
+      return True;
+   }
+   if (0 == VG_(strcmp)(arg, "--track-origins=yes")) {
+      if (MC_(clo_mc_level) == 1)
+         goto mc_level_error;
+      MC_(clo_mc_level) = 3;
+      return True;
+   }
+
 	VG_BOOL_CLO(arg, "--partial-loads-ok",      MC_(clo_partial_loads_ok))
    else VG_BOOL_CLO(arg, "--show-reachable",        MC_(clo_show_reachable))
    else VG_BOOL_CLO(arg, "--workaround-gcc296-bugs",MC_(clo_workaround_gcc296_bugs))
 
-   else VG_BOOL_CLO(arg, "--undef-value-errors",    MC_(clo_undef_value_errors))
-   
    else VG_BNUM_CLO(arg, "--freelist-vol",  MC_(clo_freelist_vol), 
                                             0, 10*1000*1000*1000LL)
    
@@ -4538,6 +5713,12 @@
       return VG_(replacement_malloc_process_cmd_line_option)(arg);
 
    return True;
+   /*NOTREACHED*/
+
+  mc_level_error:
+   VG_(message)(Vg_DebugMsg, "ERROR: --track-origins=yes has no effect "
+                             "when --undef-value-errors=no");
+   return False;
 }
 
 static void mc_print_usage(void)
@@ -4547,6 +5728,7 @@
 "    --leak-resolution=low|med|high   how much bt merging in leak check [low]\n"
 "    --show-reachable=no|yes          show reachable blocks in leak check? [no]\n"
 "    --undef-value-errors=no|yes      check for undefined value errors [yes]\n"
+"    --track-origins=no|yes           show origins of undefined values? [no]\n"
 "    --partial-loads-ok=no|yes        too hard to explain here; see manual [no]\n"
 "    --freelist-vol=<number>          volume of freed blocks queue [10000000]\n"
 "    --workaround-gcc296-bugs=no|yes  self explanatory [no]\n"
@@ -4720,17 +5902,18 @@
       case VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE:
          ok = is_mem_addressable ( arg[1], arg[2], &bad_addr );
          if (!ok)
-            mc_record_user_error ( tid, bad_addr, /*isAddrErr*/True );
+            mc_record_user_error ( tid, bad_addr, /*isAddrErr*/True, 0 );
          *ret = ok ? (UWord)NULL : bad_addr;
          break;
 
       case VG_USERREQ__CHECK_MEM_IS_DEFINED: {
          MC_ReadResult res;
-         res = is_mem_defined ( arg[1], arg[2], &bad_addr );
+         UInt otag = 0;
+         res = is_mem_defined ( arg[1], arg[2], &bad_addr, &otag );
          if (MC_AddrErr == res)
-            mc_record_user_error ( tid, bad_addr, /*isAddrErr*/True );
+            mc_record_user_error ( tid, bad_addr, /*isAddrErr*/True, 0 );
          else if (MC_ValueErr == res)
-            mc_record_user_error ( tid, bad_addr, /*isAddrErr*/False );
+            mc_record_user_error ( tid, bad_addr, /*isAddrErr*/False, otag );
          *ret = ( res==MC_Ok ? (UWord)NULL : bad_addr );
          break;
       }
@@ -4746,7 +5929,7 @@
          break;
 
       case VG_USERREQ__MAKE_MEM_UNDEFINED:
-         MC_(make_mem_undefined) ( arg[1], arg[2] );
+         make_mem_undefined_w_tid_and_okind ( arg[1], arg[2], tid, MC_OKIND_USER );
          *ret = -1;
          break;
 
@@ -4963,10 +6146,335 @@
 
 #endif
 
+
+/*------------------------------------------------------------*/
+/*--- Origin tracking stuff                                ---*/
+/*------------------------------------------------------------*/
+
+/*--------------------------------------------*/
+/*--- Origin tracking: load handlers       ---*/
+/*--------------------------------------------*/
+
+static INLINE UInt merge_origins ( UInt or1, UInt or2 ) {
+   return or1 > or2 ? or1 : or2;
+}
+
+UWord VG_REGPARM(1) MC_(helperc_b_load1)( Addr a ) {
+   OCacheLine* line;
+   UChar descr;
+   UWord lineoff = oc_line_offset(a);
+   UWord byteoff = a & 3; /* 0, 1, 2 or 3 */
+
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+   }
+
+   line = find_OCacheLine( a );
+
+   descr = line->descr[lineoff];
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(descr < 0x10);
+   }
+
+   if (LIKELY(0 == (descr & (1 << byteoff))))  {
+      return 0;
+   } else {
+      return line->w32[lineoff];
+   }
+}
+
+UWord VG_REGPARM(1) MC_(helperc_b_load2)( Addr a ) {
+   OCacheLine* line;
+   UChar descr;
+   UWord lineoff, byteoff;
+
+   if (UNLIKELY(a & 1)) {
+      /* Handle misaligned case, slowly. */
+      UInt oLo   = (UInt)MC_(helperc_b_load1)( a + 0 );
+      UInt oHi   = (UInt)MC_(helperc_b_load1)( a + 1 );
+      return merge_origins(oLo, oHi);
+   }
+
+   lineoff = oc_line_offset(a);
+   byteoff = a & 3; /* 0 or 2 */
+
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+   }
+   line = find_OCacheLine( a );
+
+   descr = line->descr[lineoff];
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(descr < 0x10);
+   }
+
+   if (LIKELY(0 == (descr & (3 << byteoff)))) {
+      return 0;
+   } else {
+      return line->w32[lineoff];
+   }
+}
+
+UWord VG_REGPARM(1) MC_(helperc_b_load4)( Addr a ) {
+   OCacheLine* line;
+   UChar descr;
+   UWord lineoff;
+
+   if (UNLIKELY(a & 3)) {
+      /* Handle misaligned case, slowly. */
+      UInt oLo   = (UInt)MC_(helperc_b_load2)( a + 0 );
+      UInt oHi   = (UInt)MC_(helperc_b_load2)( a + 2 );
+      return merge_origins(oLo, oHi);
+   }
+
+   lineoff = oc_line_offset(a);
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+   }
+
+   line = find_OCacheLine( a );
+
+   descr = line->descr[lineoff];
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(descr < 0x10);
+   }
+
+   if (LIKELY(0 == descr)) {
+      return 0;
+   } else {
+      return line->w32[lineoff];
+   }
+}
+
+UWord VG_REGPARM(1) MC_(helperc_b_load8)( Addr a ) {
+   OCacheLine* line;
+   UChar descrLo, descrHi, descr;
+   UWord lineoff;
+
+   if (UNLIKELY(a & 7)) {
+      /* Handle misaligned case, slowly. */
+      UInt oLo   = (UInt)MC_(helperc_b_load4)( a + 0 );
+      UInt oHi   = (UInt)MC_(helperc_b_load4)( a + 4 );
+      return merge_origins(oLo, oHi);
+   }
+
+   lineoff = oc_line_offset(a);
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff == (lineoff & 6)); /*0,2,4,6*//*since 8-aligned*/
+   }
+
+   line = find_OCacheLine( a );
+
+   descrLo = line->descr[lineoff + 0];
+   descrHi = line->descr[lineoff + 1];
+   descr   = descrLo | descrHi;
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(descr < 0x10);
+   }
+
+   if (LIKELY(0 == descr)) {
+      return 0; /* both 32-bit chunks are defined */
+   } else {
+      UInt oLo = descrLo == 0 ? 0 : line->w32[lineoff + 0];
+      UInt oHi = descrHi == 0 ? 0 : line->w32[lineoff + 1];
+      return merge_origins(oLo, oHi);
+   }
+}
+
+UWord VG_REGPARM(1) MC_(helperc_b_load16)( Addr a ) {
+   UInt oLo   = (UInt)MC_(helperc_b_load8)( a + 0 );
+   UInt oHi   = (UInt)MC_(helperc_b_load8)( a + 8 );
+   UInt oBoth = merge_origins(oLo, oHi);
+   return (UWord)oBoth;
+}
+
+
+/*--------------------------------------------*/
+/*--- Origin tracking: store handlers      ---*/
+/*--------------------------------------------*/
+
+void VG_REGPARM(2) MC_(helperc_b_store1)( Addr a, UWord d32 ) {
+   OCacheLine* line;
+   UWord lineoff = oc_line_offset(a);
+   UWord byteoff = a & 3; /* 0, 1, 2 or 3 */
+
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+   }
+
+   line = find_OCacheLine( a );
+
+   if (d32 == 0) {
+      line->descr[lineoff] &= ~(1 << byteoff);
+   } else {
+      line->descr[lineoff] |= (1 << byteoff);
+      line->w32[lineoff] = d32;
+   }
+}
+
+void VG_REGPARM(2) MC_(helperc_b_store2)( Addr a, UWord d32 ) {
+   OCacheLine* line;
+   UWord lineoff, byteoff;
+
+   if (UNLIKELY(a & 1)) {
+      /* Handle misaligned case, slowly. */
+      MC_(helperc_b_store1)( a + 0, d32 );
+      MC_(helperc_b_store1)( a + 1, d32 );
+      return;
+   }
+
+   lineoff = oc_line_offset(a);
+   byteoff = a & 3; /* 0 or 2 */
+
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+   }
+
+   line = find_OCacheLine( a );
+
+   if (d32 == 0) {
+      line->descr[lineoff] &= ~(3 << byteoff);
+   } else {
+      line->descr[lineoff] |= (3 << byteoff);
+      line->w32[lineoff] = d32;
+   }
+}
+
+void VG_REGPARM(2) MC_(helperc_b_store4)( Addr a, UWord d32 ) {
+   OCacheLine* line;
+   UWord lineoff;
+
+   if (UNLIKELY(a & 3)) {
+      /* Handle misaligned case, slowly. */
+      MC_(helperc_b_store2)( a + 0, d32 );
+      MC_(helperc_b_store2)( a + 2, d32 );
+      return;
+   }
+
+   lineoff = oc_line_offset(a);
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE);
+   }
+
+   line = find_OCacheLine( a );
+
+   if (d32 == 0) {
+      line->descr[lineoff] = 0;
+   } else {
+      line->descr[lineoff] = 0xF;
+      line->w32[lineoff] = d32;
+   }
+}
+
+void VG_REGPARM(2) MC_(helperc_b_store8)( Addr a, UWord d32 ) {
+   OCacheLine* line;
+   UWord lineoff;
+
+   if (UNLIKELY(a & 7)) {
+      /* Handle misaligned case, slowly. */
+      MC_(helperc_b_store4)( a + 0, d32 );
+      MC_(helperc_b_store4)( a + 4, d32 );
+      return;
+   }
+
+   lineoff = oc_line_offset(a);
+   if (OC_ENABLE_ASSERTIONS) {
+      tl_assert(lineoff == (lineoff & 6)); /*0,2,4,6*//*since 8-aligned*/
+   }
+
+   line = find_OCacheLine( a );
+
+   if (d32 == 0) {
+      line->descr[lineoff + 0] = 0;
+      line->descr[lineoff + 1] = 0;
+   } else {
+      line->descr[lineoff + 0] = 0xF;
+      line->descr[lineoff + 1] = 0xF;
+      line->w32[lineoff + 0] = d32;
+      line->w32[lineoff + 1] = d32;
+   }
+}
+
+void VG_REGPARM(2) MC_(helperc_b_store16)( Addr a, UWord d32 ) {
+   MC_(helperc_b_store8)( a + 0, d32 );
+   MC_(helperc_b_store8)( a + 8, d32 );
+}
+
+
+/*--------------------------------------------*/
+/*--- Origin tracking: sarp handlers       ---*/
+/*--------------------------------------------*/
+
+__attribute__((noinline))
+static void ocache_sarp_Set_Origins ( Addr a, UWord len, UInt otag ) {
+   if ((a & 1) && len >= 1) {
+      MC_(helperc_b_store1)( a, otag );
+      a++;
+      len--;
+   }
+   if ((a & 2) && len >= 2) {
+      MC_(helperc_b_store2)( a, otag );
+      a += 2;
+      len -= 2;
+   }
+   if (len >= 4) 
+      tl_assert(0 == (a & 3));
+   while (len >= 4) {
+      MC_(helperc_b_store4)( a, otag );
+      a += 4;
+      len -= 4;
+   }
+   if (len >= 2) {
+      MC_(helperc_b_store2)( a, otag );
+      a += 2;
+      len -= 2;
+   }
+   if (len >= 1) {
+      MC_(helperc_b_store1)( a, otag );
+      a++;
+      len--;
+   }
+   tl_assert(len == 0);
+}
+
+__attribute__((noinline))
+static void ocache_sarp_Clear_Origins ( Addr a, UWord len ) {
+   if ((a & 1) && len >= 1) {
+      MC_(helperc_b_store1)( a, 0 );
+      a++;
+      len--;
+   }
+   if ((a & 2) && len >= 2) {
+      MC_(helperc_b_store2)( a, 0 );
+      a += 2;
+      len -= 2;
+   }
+   if (len >= 4) 
+      tl_assert(0 == (a & 3));
+   while (len >= 4) {
+      MC_(helperc_b_store4)( a, 0 );
+      a += 4;
+      len -= 4;
+   }
+   if (len >= 2) {
+      MC_(helperc_b_store2)( a, 0 );
+      a += 2;
+      len -= 2;
+   }
+   if (len >= 1) {
+      MC_(helperc_b_store1)( a, 0 );
+      a++;
+      len--;
+   }
+   tl_assert(len == 0);
+}
+
+
 /*------------------------------------------------------------*/
 /*--- Setup and finalisation                               ---*/
 /*------------------------------------------------------------*/
 
+
 static void mc_post_clo_init ( void )
 {
    /* If we've been asked to emit XML, mash around various other
@@ -4976,6 +6484,38 @@
       /* MC_(clo_show_reachable) = True; */
       MC_(clo_leak_check) = LC_Full;
    }
+
+   tl_assert( MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3 );
+
+   if (MC_(clo_mc_level) == 3) {
+      /* We're doing origin tracking. */
+#     ifdef PERF_FAST_STACK
+      VG_(track_new_mem_stack_4_w_ECU)   ( mc_new_mem_stack_4_w_ECU   );
+      VG_(track_new_mem_stack_8_w_ECU)   ( mc_new_mem_stack_8_w_ECU   );
+      VG_(track_new_mem_stack_12_w_ECU)  ( mc_new_mem_stack_12_w_ECU  );
+      VG_(track_new_mem_stack_16_w_ECU)  ( mc_new_mem_stack_16_w_ECU  );
+      VG_(track_new_mem_stack_32_w_ECU)  ( mc_new_mem_stack_32_w_ECU  );
+      VG_(track_new_mem_stack_112_w_ECU) ( mc_new_mem_stack_112_w_ECU );
+      VG_(track_new_mem_stack_128_w_ECU) ( mc_new_mem_stack_128_w_ECU );
+      VG_(track_new_mem_stack_144_w_ECU) ( mc_new_mem_stack_144_w_ECU );
+      VG_(track_new_mem_stack_160_w_ECU) ( mc_new_mem_stack_160_w_ECU );
+#     endif
+      VG_(track_new_mem_stack_w_ECU)     ( mc_new_mem_stack_w_ECU     );
+   } else {
+      /* Not doing origin tracking */
+#     ifdef PERF_FAST_STACK
+      VG_(track_new_mem_stack_4)   ( mc_new_mem_stack_4   );
+      VG_(track_new_mem_stack_8)   ( mc_new_mem_stack_8   );
+      VG_(track_new_mem_stack_12)  ( mc_new_mem_stack_12  );
+      VG_(track_new_mem_stack_16)  ( mc_new_mem_stack_16  );
+      VG_(track_new_mem_stack_32)  ( mc_new_mem_stack_32  );
+      VG_(track_new_mem_stack_112) ( mc_new_mem_stack_112 );
+      VG_(track_new_mem_stack_128) ( mc_new_mem_stack_128 );
+      VG_(track_new_mem_stack_144) ( mc_new_mem_stack_144 );
+      VG_(track_new_mem_stack_160) ( mc_new_mem_stack_160 );
+#     endif
+      VG_(track_new_mem_stack)     ( mc_new_mem_stack     );
+   }
 }
 
 static void print_SM_info(char* type, int n_SMs)
@@ -5000,6 +6540,15 @@
       VG_(message)(Vg_UserMsg, 
                    "For counts of detected errors, rerun with: -v");
    }
+
+
+   if (any_value_errors && !VG_(clo_xml) && VG_(clo_verbosity) >= 1
+       && MC_(clo_mc_level) == 2) {
+      VG_(message)(Vg_UserMsg,
+                   "Use --track-origins=yes to see where "
+                   "uninitialised values come from");
+   }
+
    if (MC_(clo_leak_check) != LC_Off)
       mc_detect_memory_leaks(1/*bogus ThreadId*/, MC_(clo_leak_check));
 
@@ -5054,6 +6603,39 @@
       VG_(message)(Vg_DebugMsg,
          " memcheck: max shadow mem size:   %dk, %dM",
          max_shmem_szB / 1024, max_shmem_szB / (1024 * 1024));
+
+      if (MC_(clo_mc_level) >= 3) {
+         VG_(message)(Vg_DebugMsg,
+                      " ocacheL1: %,12lu refs   %,12lu misses (%,lu lossage)", 
+                      stats_ocacheL1_find, 
+                      stats_ocacheL1_misses,
+                      stats_ocacheL1_lossage );
+         VG_(message)(Vg_DebugMsg,
+                      " ocacheL1: %,12lu at 0   %,12lu at 1", 
+                      stats_ocacheL1_find - stats_ocacheL1_misses 
+                         - stats_ocacheL1_found_at_1 
+                         - stats_ocacheL1_found_at_N,
+                      stats_ocacheL1_found_at_1 );
+         VG_(message)(Vg_DebugMsg,
+                      " ocacheL1: %,12lu at 2+  %,12lu move-fwds", 
+                      stats_ocacheL1_found_at_N,
+                      stats_ocacheL1_movefwds );
+         VG_(message)(Vg_DebugMsg,
+                      " ocacheL1: %,12lu sizeB  %,12lu useful",
+                      (UWord)sizeof(OCache),
+                      4 * OC_W32S_PER_LINE * OC_LINES_PER_SET * OC_N_SETS );
+         VG_(message)(Vg_DebugMsg,
+                      " ocacheL2: %,12lu refs   %,12lu misses", 
+                      stats__ocacheL2_refs, 
+                      stats__ocacheL2_misses );
+         VG_(message)(Vg_DebugMsg,
+                      " ocacheL2:    %,9lu max nodes %,9lu curr nodes",
+                      stats__ocacheL2_n_nodes_max,
+                      stats__ocacheL2_n_nodes );
+         VG_(message)(Vg_DebugMsg,
+                      " niacache: %,12lu refs   %,12lu misses",
+                      stats__nia_cache_queries, stats__nia_cache_misses);
+      }
    }
 
    if (0) {
@@ -5110,8 +6692,8 @@
    VG_(needs_xml_output)          ();
 
    VG_(track_new_mem_startup)     ( mc_new_mem_startup );
-   VG_(track_new_mem_stack_signal)( MC_(make_mem_undefined) );
-   VG_(track_new_mem_brk)         ( MC_(make_mem_undefined) );
+   VG_(track_new_mem_stack_signal)( make_mem_undefined_w_tid );
+   VG_(track_new_mem_brk)         ( make_mem_undefined_w_tid );
    VG_(track_new_mem_mmap)        ( mc_new_mem_mmap );
    
    VG_(track_copy_mem_remap)      ( MC_(copy_address_range_state) );
@@ -5131,20 +6713,11 @@
    VG_(track_die_mem_brk)         ( MC_(make_mem_noaccess) );
    VG_(track_die_mem_munmap)      ( MC_(make_mem_noaccess) ); 
 
-#ifdef PERF_FAST_STACK
-   VG_(track_new_mem_stack_4)     ( mc_new_mem_stack_4   );
-   VG_(track_new_mem_stack_8)     ( mc_new_mem_stack_8   );
-   VG_(track_new_mem_stack_12)    ( mc_new_mem_stack_12  );
-   VG_(track_new_mem_stack_16)    ( mc_new_mem_stack_16  );
-   VG_(track_new_mem_stack_32)    ( mc_new_mem_stack_32  );
-   VG_(track_new_mem_stack_112)   ( mc_new_mem_stack_112 );
-   VG_(track_new_mem_stack_128)   ( mc_new_mem_stack_128 );
-   VG_(track_new_mem_stack_144)   ( mc_new_mem_stack_144 );
-   VG_(track_new_mem_stack_160)   ( mc_new_mem_stack_160 );
-#endif
-   VG_(track_new_mem_stack)       ( mc_new_mem_stack     );
+   /* Defer the specification of the new_mem_stack functions to the
+      post_clo_init function, since we need to first parse the command
+      line before deciding which set to use. */
 
-#ifdef PERF_FAST_STACK
+#  ifdef PERF_FAST_STACK
    VG_(track_die_mem_stack_4)     ( mc_die_mem_stack_4   );
    VG_(track_die_mem_stack_8)     ( mc_die_mem_stack_8   );
    VG_(track_die_mem_stack_12)    ( mc_die_mem_stack_12  );
@@ -5154,7 +6727,7 @@
    VG_(track_die_mem_stack_128)   ( mc_die_mem_stack_128 );
    VG_(track_die_mem_stack_144)   ( mc_die_mem_stack_144 );
    VG_(track_die_mem_stack_160)   ( mc_die_mem_stack_160 );
-#endif
+#  endif
    VG_(track_die_mem_stack)       ( mc_die_mem_stack     );
    
    VG_(track_ban_mem_stack)       ( MC_(make_mem_noaccess) );
@@ -5164,7 +6737,7 @@
    VG_(track_pre_mem_write)       ( check_mem_is_addressable );
    VG_(track_post_mem_write)      ( mc_post_mem_write );
 
-   if (MC_(clo_undef_value_errors))
+   if (MC_(clo_mc_level) >= 2)
       VG_(track_pre_reg_read)     ( mc_pre_reg_read );
 
    VG_(track_post_reg_write)                  ( mc_post_reg_write );
@@ -5184,6 +6757,9 @@
 
    // BYTES_PER_SEC_VBIT_NODE must be a power of two.
    tl_assert(-1 != VG_(log2)(BYTES_PER_SEC_VBIT_NODE));
+
+   init_OCache();
+   init_nia_to_ecu_cache();
 }
 
 VG_DETERMINE_INTERFACE_VERSION(mc_pre_clo_init)
diff --git a/memcheck/mc_malloc_wrappers.c b/memcheck/mc_malloc_wrappers.c
index 0c2eb72..e3287a8 100644
--- a/memcheck/mc_malloc_wrappers.c
+++ b/memcheck/mc_malloc_wrappers.c
@@ -132,14 +132,14 @@
 
 /* Allocate its shadow chunk, put it on the appropriate list. */
 static
-MC_Chunk* create_MC_Chunk ( ThreadId tid, Addr p, SizeT szB,
+MC_Chunk* create_MC_Chunk ( ExeContext* ec, Addr p, SizeT szB,
                             MC_AllocKind kind)
 {
    MC_Chunk* mc  = VG_(malloc)(sizeof(MC_Chunk));
    mc->data      = p;
    mc->szB       = szB;
    mc->allockind = kind;
-   mc->where     = VG_(record_ExeContext)(tid, 0/*first_ip_delta*/);
+   mc->where     = ec;
 
    /* Paranoia ... ensure the MC_Chunk is off-limits to the client, so
       the mc->data field isn't visible to the leak checker.  If memory
@@ -186,6 +186,8 @@
                        Addr p, SizeT szB, SizeT alignB, UInt rzB,
                        Bool is_zeroed, MC_AllocKind kind, VgHashTable table)
 {
+   ExeContext* ec;
+
    cmalloc_n_mallocs ++;
 
    // Allocate and zero if necessary
@@ -209,12 +211,18 @@
    // Only update this stat if allocation succeeded.
    cmalloc_bs_mallocd += (ULong)szB;
 
-   VG_(HT_add_node)( table, create_MC_Chunk(tid, p, szB, kind) );
+   ec = VG_(record_ExeContext)(tid, 0/*first_ip_delta*/);
+   tl_assert(ec);
+
+   VG_(HT_add_node)( table, create_MC_Chunk(ec, p, szB, kind) );
 
    if (is_zeroed)
       MC_(make_mem_defined)( p, szB );
-   else
-      MC_(make_mem_undefined)( p, szB );
+   else {
+      UInt ecu = VG_(get_ECU_from_ExeContext)(ec);
+      tl_assert(VG_(is_plausible_ECU)(ecu));
+      MC_(make_mem_undefined_w_otag)( p, szB, ecu | MC_OKIND_HEAP );
+   }
 
    return (void*)p;
 }
@@ -390,11 +398,21 @@
       a_new = (Addr)VG_(cli_malloc)(VG_(clo_alignment), new_szB);
 
       if (a_new) {
+         UInt        ecu;
+         ExeContext* ec;
+
+         ec = VG_(record_ExeContext)(tid, 0/*first_ip_delta*/);
+         tl_assert(ec);
+         ecu = VG_(get_ECU_from_ExeContext)(ec);
+         tl_assert(VG_(is_plausible_ECU)(ecu));
+
          /* First half kept and copied, second half new, red zones as normal */
-         MC_(make_mem_noaccess)( a_new-MC_MALLOC_REDZONE_SZB, MC_MALLOC_REDZONE_SZB );
-         MC_(copy_address_range_state)( (Addr)p_old, a_new, mc->szB );
-         MC_(make_mem_undefined)( a_new+mc->szB, new_szB-mc->szB );
-         MC_(make_mem_noaccess) ( a_new+new_szB, MC_MALLOC_REDZONE_SZB );
+         MC_(make_mem_noaccess)( a_new-MC_MALLOC_REDZONE_SZB, 
+                                 MC_MALLOC_REDZONE_SZB );
+         MC_(copy_address_range_state) ( (Addr)p_old, a_new, mc->szB );
+         MC_(make_mem_undefined_w_otag)( a_new+mc->szB, new_szB-mc->szB,
+                                                        ecu | MC_OKIND_HEAP );
+         MC_(make_mem_noaccess)        ( a_new+new_szB, MC_MALLOC_REDZONE_SZB );
 
          /* Possibly fill new area with specified junk */
          if (MC_(clo_malloc_fill) != -1) {
@@ -420,7 +438,7 @@
          die_and_free_mem ( tid, mc, MC_MALLOC_REDZONE_SZB );
 
          // Allocate a new chunk.
-         mc = create_MC_Chunk( tid, a_new, new_szB, MC_AllocMalloc );
+         mc = create_MC_Chunk( ec, a_new, new_szB, MC_AllocMalloc );
       }
 
       p_new = (void*)a_new;
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 81644cc..67dab52 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -35,12 +35,13 @@
 #include "pub_tool_libcprint.h"
 #include "pub_tool_tooliface.h"
 #include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
-#include "mc_include.h"
-
 #include "pub_tool_xarray.h"
 #include "pub_tool_mallocfree.h"
 #include "pub_tool_libcbase.h"
 
+#include "mc_include.h"
+
+
 /* This file implements the Memcheck instrumentation, and in
    particular contains the core of its undefined value detection
    machinery.  For a comprehensive background of the terminology,
@@ -116,7 +117,7 @@
 
 struct _MCEnv;
 
-static IRType  shadowType ( IRType ty );
+static IRType  shadowTypeV ( IRType ty );
 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
 
 
@@ -130,6 +131,7 @@
       /* MODIFIED: the superblock being constructed.  IRStmts are
          added. */
       IRSB* bb;
+      Bool  trace;
 
       /* MODIFIED: a table [0 .. #temps_in_original_bb-1] which maps
          original temps to their current their current shadow temp.
@@ -139,7 +141,8 @@
          point original tmps are shadowed by integer tmps of the same
          size, and Bit-typed original tmps are shadowed by the type
          Ity_I8.  See comment below. */
-      IRTemp* tmpMap;
+      IRTemp* tmpMapV;        /* V-bit tmp shadows */
+      IRTemp* tmpMapB; /* origin tracking tmp shadows */
       Int     n_originalTmps; /* for range checking */
 
       /* MODIFIED: indicates whether "bogus" literals have so far been
@@ -183,15 +186,15 @@
 
 /* Find the tmp currently shadowing the given original tmp.  If none
    so far exists, allocate one.  */
-static IRTemp findShadowTmp ( MCEnv* mce, IRTemp orig )
+static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
 {
    tl_assert(orig < mce->n_originalTmps);
-   if (mce->tmpMap[orig] == IRTemp_INVALID) {
-      mce->tmpMap[orig] 
+   if (mce->tmpMapV[orig] == IRTemp_INVALID) {
+      mce->tmpMapV[orig] 
          = newIRTemp(mce->bb->tyenv, 
-                     shadowType(mce->bb->tyenv->types[orig]));
+                     shadowTypeV(mce->bb->tyenv->types[orig]));
    }
-   return mce->tmpMap[orig];
+   return mce->tmpMapV[orig];
 }
 
 /* Allocate a new shadow for the given original tmp.  This means any
@@ -200,12 +203,12 @@
    for undefinedness, but unfortunately IR's SSA property disallows
    this.  Instead we must abandon the old shadow, allocate a new one
    and use that instead. */
-static void newShadowTmp ( MCEnv* mce, IRTemp orig )
+static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
 {
    tl_assert(orig < mce->n_originalTmps);
-   mce->tmpMap[orig] 
+   mce->tmpMapV[orig] 
       = newIRTemp(mce->bb->tyenv, 
-                  shadowType(mce->bb->tyenv->types[orig]));
+                  shadowTypeV(mce->bb->tyenv->types[orig]));
 }
 
 
@@ -265,7 +268,7 @@
    given type.  The only valid shadow types are Bit, I8, I16, I32,
    I64, V128. */
 
-static IRType shadowType ( IRType ty )
+static IRType shadowTypeV ( IRType ty )
 {
    switch (ty) {
       case Ity_I1:
@@ -278,7 +281,7 @@
       case Ity_F64:  return Ity_I64;
       case Ity_V128: return Ity_V128;
       default: ppIRType(ty); 
-               VG_(tool_panic)("memcheck:shadowType");
+               VG_(tool_panic)("memcheck:shadowTypeV");
    }
 }
 
@@ -301,13 +304,21 @@
 /*--- Constructing IR fragments                            ---*/
 /*------------------------------------------------------------*/
 
-/* assign value to tmp */
-#define assign(_bb,_tmp,_expr)   \
-   addStmtToIRSB((_bb), IRStmt_WrTmp((_tmp),(_expr)))
-
 /* add stmt to a bb */
-#define stmt(_bb,_stmt)    \
-   addStmtToIRSB((_bb), (_stmt))
+static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
+   if (mce->trace) {
+      VG_(printf)("  %c: ", cat);
+      ppIRStmt(st);
+      VG_(printf)("\n");
+   }
+   addStmtToIRSB(mce->bb, st);
+}
+
+/* assign value to tmp */
+static inline 
+void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
+  stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
+}
 
 /* build various kinds of expressions */
 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
@@ -319,12 +330,20 @@
 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
 #define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
 
-/* bind the given expression to a new temporary, and return the
+/* Bind the given expression to a new temporary, and return the
    temporary.  This effectively converts an arbitrary expression into
-   an atom. */
-static IRAtom* assignNew ( MCEnv* mce, IRType ty, IRExpr* e ) {
-   IRTemp t = newIRTemp(mce->bb->tyenv, ty);
-   assign(mce->bb, t, e);
+   an atom.
+
+   'ty' is the type of 'e' and hence the type that the new temporary
+   needs to be.  But passing it is redundant, since we can deduce the
+   type merely by inspecting 'e'.  So at least use that fact to assert
+   that the two types agree. */
+static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e ) {
+   IRTemp t;
+   IRType tyE = typeOfIRExpr(mce->bb->tyenv, e);
+   tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
+   t = newIRTemp(mce->bb->tyenv, ty);
+   assign(cat, mce, t, e);
    return mkexpr(t);
 }
 
@@ -338,31 +357,31 @@
 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I8, binop(Iop_And8, a1, a2));
+   return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
 }
 
 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I16, binop(Iop_And16, a1, a2));
+   return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
 }
 
 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I32, binop(Iop_And32, a1, a2));
+   return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
 }
 
 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I64, binop(Iop_And64, a1, a2));
+   return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
 }
 
 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_V128, binop(Iop_AndV128, a1, a2));
+   return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
 }
 
 /* --------- Undefined-if-either-undefined --------- */
@@ -370,31 +389,31 @@
 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I8, binop(Iop_Or8, a1, a2));
+   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
 }
 
 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I16, binop(Iop_Or16, a1, a2));
+   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
 }
 
 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I32, binop(Iop_Or32, a1, a2));
+   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
 }
 
 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_I64, binop(Iop_Or64, a1, a2));
+   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
 }
 
 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    tl_assert(isShadowAtom(mce,a1));
    tl_assert(isShadowAtom(mce,a2));
-   return assignNew(mce, Ity_V128, binop(Iop_OrV128, a1, a2));
+   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
 }
 
 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
@@ -414,22 +433,22 @@
 
 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I8, unop(Iop_Left8, a1));
+   return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
 }
 
 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I16, unop(Iop_Left16, a1));
+   return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
 }
 
 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I32, unop(Iop_Left32, a1));
+   return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
 }
 
 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
    tl_assert(isShadowAtom(mce,a1));
-   return assignNew(mce, Ity_I64, unop(Iop_Left64, a1));
+   return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
 }
 
 /* --------- 'Improvement' functions for AND/OR. --------- */
@@ -442,7 +461,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I8, binop(Iop_Or8, data, vbits));
+   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
 }
 
 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -450,7 +469,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I16, binop(Iop_Or16, data, vbits));
+   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
 }
 
 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -458,7 +477,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I32, binop(Iop_Or32, data, vbits));
+   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
 }
 
 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -466,7 +485,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_I64, binop(Iop_Or64, data, vbits));
+   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
 }
 
 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
@@ -474,7 +493,7 @@
    tl_assert(isOriginalAtom(mce, data));
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
-   return assignNew(mce, Ity_V128, binop(Iop_OrV128, data, vbits));
+   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
 }
 
 /* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
@@ -486,9 +505,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I8, 
+             'V', mce, Ity_I8, 
              binop(Iop_Or8, 
-                   assignNew(mce, Ity_I8, unop(Iop_Not8, data)), 
+                   assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)), 
                    vbits) );
 }
 
@@ -498,9 +517,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I16, 
+             'V', mce, Ity_I16, 
              binop(Iop_Or16, 
-                   assignNew(mce, Ity_I16, unop(Iop_Not16, data)), 
+                   assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)), 
                    vbits) );
 }
 
@@ -510,9 +529,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I32, 
+             'V', mce, Ity_I32, 
              binop(Iop_Or32, 
-                   assignNew(mce, Ity_I32, unop(Iop_Not32, data)), 
+                   assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)), 
                    vbits) );
 }
 
@@ -522,9 +541,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_I64, 
+             'V', mce, Ity_I64, 
              binop(Iop_Or64, 
-                   assignNew(mce, Ity_I64, unop(Iop_Not64, data)), 
+                   assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)), 
                    vbits) );
 }
 
@@ -534,9 +553,9 @@
    tl_assert(isShadowAtom(mce, vbits));
    tl_assert(sameKindedAtoms(data, vbits));
    return assignNew(
-             mce, Ity_V128, 
+             'V', mce, Ity_V128, 
              binop(Iop_OrV128, 
-                   assignNew(mce, Ity_V128, unop(Iop_NotV128, data)), 
+                   assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)), 
                    vbits) );
 }
 
@@ -553,14 +572,14 @@
 
    /* Fast-track some common cases */
    if (src_ty == Ity_I32 && dst_ty == Ity_I32)
-      return assignNew(mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
+      return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
 
    if (src_ty == Ity_I64 && dst_ty == Ity_I64)
-      return assignNew(mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
+      return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
 
    if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
-      IRAtom* tmp = assignNew(mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
-      return assignNew(mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
+      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
+      return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    }
 
    /* Else do it the slow way .. */
@@ -570,24 +589,24 @@
          tmp1 = vbits;
          break;
       case Ity_I8: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
          break;
       case Ity_I16: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
          break;
       case Ity_I32: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
          break;
       case Ity_I64: 
-         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
+         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
          break;
       case Ity_I128: {
          /* Gah.  Chop it in half, OR the halves together, and compare
             that with zero. */
-         IRAtom* tmp2 = assignNew(mce, Ity_I64, unop(Iop_128HIto64, vbits));
-         IRAtom* tmp3 = assignNew(mce, Ity_I64, unop(Iop_128to64, vbits));
-         IRAtom* tmp4 = assignNew(mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
-         tmp1         = assignNew(mce, Ity_I1, 
+         IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
+         IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
+         IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
+         tmp1         = assignNew('V', mce, Ity_I1, 
                                        unop(Iop_CmpNEZ64, tmp4));
          break;
       }
@@ -601,20 +620,20 @@
       case Ity_I1:
          return tmp1;
       case Ity_I8: 
-         return assignNew(mce, Ity_I8, unop(Iop_1Sto8, tmp1));
+         return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
       case Ity_I16: 
-         return assignNew(mce, Ity_I16, unop(Iop_1Sto16, tmp1));
+         return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
       case Ity_I32: 
-         return assignNew(mce, Ity_I32, unop(Iop_1Sto32, tmp1));
+         return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
       case Ity_I64: 
-         return assignNew(mce, Ity_I64, unop(Iop_1Sto64, tmp1));
+         return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
       case Ity_V128:
-         tmp1 = assignNew(mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
-         tmp1 = assignNew(mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
+         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
+         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
          return tmp1;
       case Ity_I128:
-         tmp1 = assignNew(mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
-         tmp1 = assignNew(mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
+         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
+         tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
          return tmp1;
       default: 
          ppIRType(dst_ty);
@@ -705,23 +724,25 @@
    }
 
    naive 
-      = mkPCastTo(mce,ty, assignNew(mce, ty, binop(opUIFU, vxx, vyy)));
+      = mkPCastTo(mce,ty,
+                  assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
 
    vec 
       = assignNew(
-           mce,ty, 
+           'V', mce,ty, 
            binop( opOR,
-                  assignNew(mce,ty, binop(opOR, vxx, vyy)),
+                  assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
                   assignNew(
-                     mce,ty, 
+                     'V', mce,ty, 
                      unop( opNOT,
-                           assignNew(mce,ty, binop(opXOR, xx, yy))))));
+                           assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
 
    improvement_term
-      = mkPCastTo( mce,ty, assignNew(mce,Ity_I1, binop(opCMP, vec, top)));
+      = mkPCastTo( mce,ty,
+                   assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
 
    improved
-      = assignNew( mce,ty, binop(opDIFD, naive, improvement_term) );
+      = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
 
    final_cast
       = mkPCastTo( mce, Ity_I1, improved );
@@ -817,18 +838,18 @@
          binop(
             opOR,
             assignNew(
-               mce,ty,
+               'V', mce,ty,
                binop(
                   opAND,
                   mkPCastTo(mce,ty, xxhash), 
                   threeLeft1
                )),
             assignNew(
-               mce,ty,
+               'V', mce,ty,
                binop(
                   opSHL,
                   assignNew(
-                     mce,ty,
+                     'V', mce,ty,
                      binop(opSHR, xxhash, mkU8(width-1))),
                   mkU8(3)
                ))
@@ -851,6 +872,9 @@
 /*--- Emit a test and complaint if something is undefined. ---*/
 /*------------------------------------------------------------*/
 
+static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
+
+
 /* Set the annotations on a dirty helper to indicate that the stack
    pointer and instruction pointers might be read.  This is the
    behaviour of all 'emit-a-complaint' style functions we might
@@ -884,9 +908,14 @@
    Int      sz;
    IRDirty* di;
    IRAtom*  cond;
+   IRAtom*  origin;
+   void*    fn;
+   HChar*   nm;
+   IRExpr** args;
+   Int      nargs;
 
    // Don't do V bit tests if we're not reporting undefined value errors.
-   if (!MC_(clo_undef_value_errors))
+   if (MC_(clo_mc_level) == 1)
       return;
 
    /* Since the original expression is atomic, there's no duplicated
@@ -906,51 +935,103 @@
    cond = mkPCastTo( mce, Ity_I1, vatom );
    /* cond will be 0 if all defined, and 1 if any not defined. */
 
+   /* Get the origin info for the value we are about to check.  At
+      least, if we are doing origin tracking.  If not, use a dummy
+      zero origin. */
+   if (MC_(clo_mc_level) == 3) {
+      origin = schemeE( mce, atom );
+      if (mce->hWordTy == Ity_I64) {
+         origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
+      }
+   } else {
+      origin = NULL;
+   }
+
+   fn    = NULL;
+   nm    = NULL;
+   args  = NULL;
+   nargs = -1;
+
    switch (sz) {
       case 0:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check0_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check0_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check0_fail_w_o);
+            nm    = "MC_(helperc_value_check0_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check0_fail_no_o);
+            nm    = "MC_(helperc_value_check0_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       case 1:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check1_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check1_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check1_fail_w_o);
+            nm    = "MC_(helperc_value_check1_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check1_fail_no_o);
+            nm    = "MC_(helperc_value_check1_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       case 4:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check4_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check4_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check4_fail_w_o);
+            nm    = "MC_(helperc_value_check4_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check4_fail_no_o);
+            nm    = "MC_(helperc_value_check4_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       case 8:
-         di = unsafeIRDirty_0_N( 
-                 0/*regparms*/, 
-                 "MC_(helperc_value_check8_fail)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_value_check8_fail) ),
-                 mkIRExprVec_0() 
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_check8_fail_w_o);
+            nm    = "MC_(helperc_value_check8_fail_w_o)";
+            args  = mkIRExprVec_1(origin);
+            nargs = 1;
+         } else {
+            fn    = &MC_(helperc_value_check8_fail_no_o);
+            nm    = "MC_(helperc_value_check8_fail_no_o)";
+            args  = mkIRExprVec_0();
+            nargs = 0;
+         }
          break;
       default:
-         di = unsafeIRDirty_0_N( 
-                 1/*regparms*/, 
-                 "MC_(helperc_complain_undef)",
-                 VG_(fnptr_to_fnentry)( &MC_(helperc_complain_undef) ),
-                 mkIRExprVec_1( mkIRExpr_HWord( sz ))
-              );
+         if (origin) {
+            fn    = &MC_(helperc_value_checkN_fail_w_o);
+            nm    = "MC_(helperc_value_checkN_fail_w_o)";
+            args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
+            nargs = 2;
+         } else {
+            fn    = &MC_(helperc_value_checkN_fail_no_o);
+            nm    = "MC_(helperc_value_checkN_fail_no_o)";
+            args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
+            nargs = 1;
+         }
          break;
    }
+
+   tl_assert(fn);
+   tl_assert(nm);
+   tl_assert(args);
+   tl_assert(nargs >= 0 && nargs <= 2);
+   tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
+              || (MC_(clo_mc_level) == 2 && origin == NULL) );
+
+   di = unsafeIRDirty_0_N( nargs/*regparms*/, nm, 
+                           VG_(fnptr_to_fnentry)( fn ), args );
    di->guard = cond;
    setHelperAnns( mce, di );
-   stmt( mce->bb, IRStmt_Dirty(di));
+   stmt( 'V', mce, IRStmt_Dirty(di));
 
    /* Set the shadow tmp to be defined.  First, update the
       orig->shadow tmp mapping to reflect the fact that this shadow is
@@ -959,9 +1040,9 @@
    /* sameKindedAtoms ... */
    if (vatom->tag == Iex_RdTmp) {
       tl_assert(atom->tag == Iex_RdTmp);
-      newShadowTmp(mce, atom->Iex.RdTmp.tmp);
-      assign(mce->bb, findShadowTmp(mce, atom->Iex.RdTmp.tmp), 
-                      definedOfType(ty));
+      newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
+      assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), 
+                       definedOfType(ty));
    }
 }
 
@@ -1015,7 +1096,7 @@
    // Don't do shadow PUTs if we're not doing undefined value checking.
    // Their absence lets Vex's optimiser remove all the shadow computation
    // that they depend on, which includes GETs of the shadow registers.
-   if (!MC_(clo_undef_value_errors))
+   if (MC_(clo_mc_level) == 1)
       return;
    
    if (atom) {
@@ -1035,7 +1116,7 @@
       /* complainIfUndefined(mce, atom); */
    } else {
       /* Do a plain shadow Put. */
-      stmt( mce->bb, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ) );
+      stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ) );
    }
 }
 
@@ -1055,14 +1136,14 @@
    // Don't do shadow PUTIs if we're not doing undefined value checking.
    // Their absence lets Vex's optimiser remove all the shadow computation
    // that they depend on, which includes GETIs of the shadow registers.
-   if (!MC_(clo_undef_value_errors))
+   if (MC_(clo_mc_level) == 1)
       return;
    
    tl_assert(isOriginalAtom(mce,atom));
    vatom = expr2vbits( mce, atom );
    tl_assert(sameKindedAtoms(atom, vatom));
    ty   = descr->elemTy;
-   tyS  = shadowType(ty);
+   tyS  = shadowTypeV(ty);
    arrSize = descr->nElems * sizeofIRType(ty);
    tl_assert(ty != Ity_I1);
    tl_assert(isOriginalAtom(mce,ix));
@@ -1077,7 +1158,7 @@
       IRRegArray* new_descr 
          = mkIRRegArray( descr->base + mce->layout->total_sizeB, 
                          tyS, descr->nElems);
-      stmt( mce->bb, IRStmt_PutI( new_descr, ix, bias, vatom ));
+      stmt( 'V', mce, IRStmt_PutI( new_descr, ix, bias, vatom ));
    }
 }
 
@@ -1088,7 +1169,7 @@
 static 
 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
 {
-   IRType tyS = shadowType(ty);
+   IRType tyS = shadowTypeV(ty);
    tl_assert(ty != Ity_I1);
    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
       /* Always defined, return all zeroes of the relevant type */
@@ -1096,6 +1177,7 @@
    } else {
       /* return a cloned version of the Get that refers to the shadow
          area. */
+      /* FIXME: this isn't an atom! */
       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
    }
 }
@@ -1109,7 +1191,7 @@
                       IRRegArray* descr, IRAtom* ix, Int bias )
 {
    IRType ty   = descr->elemTy;
-   IRType tyS  = shadowType(ty);
+   IRType tyS  = shadowTypeV(ty);
    Int arrSize = descr->nElems * sizeofIRType(ty);
    tl_assert(ty != Ity_I1);
    tl_assert(isOriginalAtom(mce,ix));
@@ -1403,31 +1485,31 @@
    }
 
    // a_min = aa & ~qaa
-   a_min = assignNew(mce,ty, 
+   a_min = assignNew('V', mce,ty, 
                      binop(opAND, aa,
-                                  assignNew(mce,ty, unop(opNOT, qaa))));
+                                  assignNew('V', mce,ty, unop(opNOT, qaa))));
 
    // b_min = bb & ~qbb
-   b_min = assignNew(mce,ty, 
+   b_min = assignNew('V', mce,ty, 
                      binop(opAND, bb,
-                                  assignNew(mce,ty, unop(opNOT, qbb))));
+                                  assignNew('V', mce,ty, unop(opNOT, qbb))));
 
    // a_max = aa | qaa
-   a_max = assignNew(mce,ty, binop(opOR, aa, qaa));
+   a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
 
    // b_max = bb | qbb
-   b_max = assignNew(mce,ty, binop(opOR, bb, qbb));
+   b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
 
    if (add) {
       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
       return
-      assignNew(mce,ty,
+      assignNew('V', mce,ty,
          binop( opOR,
-                assignNew(mce,ty, binop(opOR, qaa, qbb)),
-                assignNew(mce,ty, 
+                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
+                assignNew('V', mce,ty, 
                    binop( opXOR, 
-                          assignNew(mce,ty, binop(opADD, a_min, b_min)),
-                          assignNew(mce,ty, binop(opADD, a_max, b_max))
+                          assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
+                          assignNew('V', mce,ty, binop(opADD, a_max, b_max))
                    )
                 )
          )
@@ -1435,13 +1517,13 @@
    } else {
       // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max + b_min))
       return
-      assignNew(mce,ty,
+      assignNew('V', mce,ty,
          binop( opOR,
-                assignNew(mce,ty, binop(opOR, qaa, qbb)),
-                assignNew(mce,ty, 
+                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
+                assignNew('V', mce,ty, 
                    binop( opXOR, 
-                          assignNew(mce,ty, binop(opSUB, a_min, b_max)),
-                          assignNew(mce,ty, binop(opSUB, a_max, b_min))
+                          assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
+                          assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
                    )
                 )
          )
@@ -1488,9 +1570,9 @@
    tl_assert(sameKindedAtoms(qbb,bb));
    return 
       assignNew(
-         mce, ty,
+         'V', mce, ty,
          mkUifU( mce, ty,
-                 assignNew(mce, ty, binop(original_op, qaa, bb)),
+                 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
                  mkPCastTo(mce, ty, qbb)
          )
    );
@@ -1505,37 +1587,37 @@
 
 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
 }
 
 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
 }
 
 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
 }
 
 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
+   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
 }
 
 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
+   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
 }
 
 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
+   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
 }
 
 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
 {
-   return assignNew(mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
+   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
 }
 
 
@@ -1582,7 +1664,7 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_V128, mkPCast32x4(mce, at));
+   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
    return at;
 }
 
@@ -1591,7 +1673,7 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_V128, mkPCast32x4(mce, vatomX));
+   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
    return at;
 }
 
@@ -1602,9 +1684,9 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_I32, unop(Iop_V128to32, at));
+   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
    at = mkPCastTo(mce, Ity_I32, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
    return at;
 }
 
@@ -1613,9 +1695,9 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_I32, unop(Iop_V128to32, vatomX));
+   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
    at = mkPCastTo(mce, Ity_I32, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
    return at;
 }
 
@@ -1628,7 +1710,7 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_V128, mkPCast64x2(mce, at));
+   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
    return at;
 }
 
@@ -1637,7 +1719,7 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_V128, mkPCast64x2(mce, vatomX));
+   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
    return at;
 }
 
@@ -1648,9 +1730,9 @@
    tl_assert(isShadowAtom(mce, vatomX));
    tl_assert(isShadowAtom(mce, vatomY));
    at = mkUifUV128(mce, vatomX, vatomY);
-   at = assignNew(mce, Ity_I64, unop(Iop_V128to64, at));
+   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
    at = mkPCastTo(mce, Ity_I64, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
    return at;
 }
 
@@ -1659,9 +1741,9 @@
 {
    IRAtom* at;
    tl_assert(isShadowAtom(mce, vatomX));
-   at = assignNew(mce, Ity_I64, unop(Iop_V128to64, vatomX));
+   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
    at = mkPCastTo(mce, Ity_I64, at);
-   at = assignNew(mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
+   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
    return at;
 }
 
@@ -1708,9 +1790,9 @@
    }
    tl_assert(isShadowAtom(mce,vatom1));
    tl_assert(isShadowAtom(mce,vatom2));
-   at1 = assignNew(mce, Ity_V128, pcast(mce, vatom1));
-   at2 = assignNew(mce, Ity_V128, pcast(mce, vatom2));
-   at3 = assignNew(mce, Ity_V128, binop(narrow_op, at1, at2));
+   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
+   at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
+   at3 = assignNew('V', mce, Ity_V128, binop(narrow_op, at1, at2));
    return at3;
 }
 
@@ -1728,9 +1810,9 @@
    }
    tl_assert(isShadowAtom(mce,vatom1));
    tl_assert(isShadowAtom(mce,vatom2));
-   at1 = assignNew(mce, Ity_I64, pcast(mce, vatom1));
-   at2 = assignNew(mce, Ity_I64, pcast(mce, vatom2));
-   at3 = assignNew(mce, Ity_I64, binop(narrow_op, at1, at2));
+   at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
+   at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
+   at3 = assignNew('V', mce, Ity_I64, binop(narrow_op, at1, at2));
    return at3;
 }
 
@@ -1927,7 +2009,7 @@
       case Iop_ShlN8x8:
          /* Same scheme as with all other shifts. */
          complainIfUndefined(mce, atom2);
-         return assignNew(mce, Ity_I64, binop(op, vatom1, atom2));
+         return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
 
       case Iop_QNarrow32Sx2:
       case Iop_QNarrow16Sx4:
@@ -1979,7 +2061,7 @@
       case Iop_InterleaveHI8x8:
       case Iop_CatOddLanes16x4:
       case Iop_CatEvenLanes16x4:
-         return assignNew(mce, Ity_I64, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
 
       /* Perm8x8: rearrange values in left arg using steering values
         from right arg.  So rearrange the vbits in the same way but
@@ -1987,7 +2069,7 @@
       case Iop_Perm8x8:
          return mkUifU64(
                    mce,
-                   assignNew(mce, Ity_I64, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
                    mkPCast8x8(mce, vatom2)
                 );
 
@@ -2007,7 +2089,7 @@
             this is wrong now, scalar shifts are done properly lazily.
             Vector shifts should be fixed too. */
          complainIfUndefined(mce, atom2);
-         return assignNew(mce, Ity_V128, binop(op, vatom1, atom2));
+         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
 
       /* V x V shifts/rotates are done using the standard lazy scheme. */
       case Iop_Shl8x16:
@@ -2015,7 +2097,7 @@
       case Iop_Sar8x16:
       case Iop_Rol8x16:
          return mkUifUV128(mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast8x16(mce,vatom2)
                 );
 
@@ -2024,7 +2106,7 @@
       case Iop_Sar16x8:
       case Iop_Rol16x8:
          return mkUifUV128(mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast16x8(mce,vatom2)
                 );
 
@@ -2033,7 +2115,7 @@
       case Iop_Sar32x4:
       case Iop_Rol32x4:
          return mkUifUV128(mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast32x4(mce,vatom2)
                 );
 
@@ -2163,7 +2245,7 @@
       case Iop_InterleaveHI32x4:
       case Iop_InterleaveHI16x8:
       case Iop_InterleaveHI8x16:
-         return assignNew(mce, Ity_V128, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
  
      /* Perm8x16: rearrange values in left arg using steering values
         from right arg.  So rearrange the vbits in the same way but
@@ -2171,7 +2253,7 @@
       case Iop_Perm8x16:
          return mkUifUV128(
                    mce,
-                   assignNew(mce, Ity_V128, binop(op, vatom1, atom2)),
+                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                    mkPCast8x16(mce, vatom2)
                 );
 
@@ -2186,8 +2268,8 @@
       case Iop_MullEven16Sx8: {
          IRAtom* at;
          at = binary16Ix8(mce,vatom1,vatom2);
-         at = assignNew(mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
-         at = assignNew(mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
 	 return at;
       }
 
@@ -2196,8 +2278,8 @@
       case Iop_MullEven8Sx16: {
          IRAtom* at;
          at = binary8Ix16(mce,vatom1,vatom2);
-         at = assignNew(mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
-         at = assignNew(mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
+         at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
 	 return at;
       }
 
@@ -2207,8 +2289,8 @@
          than a data steering operation. */
       case Iop_Narrow32x4: 
       case Iop_Narrow16x8: 
-         return assignNew(mce, Ity_V128, 
-                               binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_V128, 
+                                    binop(op, vatom1, vatom2));
 
       case Iop_ShrV128:
       case Iop_ShlV128:
@@ -2216,12 +2298,12 @@
             this is wrong now, scalar shifts are done properly lazily.
             Vector shifts should be fixed too. */
          complainIfUndefined(mce, atom2);
-         return assignNew(mce, Ity_V128, binop(op, vatom1, atom2));
+         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
 
 
       /* I128-bit data-steering */
       case Iop_64HLto128:
-         return assignNew(mce, Ity_I128, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
 
       /* Scalar floating point */
 
@@ -2260,36 +2342,36 @@
          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
 
       case Iop_16HLto32:
-         return assignNew(mce, Ity_I32, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
       case Iop_32HLto64:
-         return assignNew(mce, Ity_I64, binop(op, vatom1, vatom2));
+         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
 
       case Iop_MullS64:
       case Iop_MullU64: {
          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
-         return assignNew(mce, Ity_I128, binop(Iop_64HLto128, vHi64, vLo64));
+         return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, vHi64, vLo64));
       }
 
       case Iop_MullS32:
       case Iop_MullU32: {
          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
-         return assignNew(mce, Ity_I64, binop(Iop_32HLto64, vHi32, vLo32));
+         return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, vHi32, vLo32));
       }
 
       case Iop_MullS16:
       case Iop_MullU16: {
          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
-         return assignNew(mce, Ity_I32, binop(Iop_16HLto32, vHi16, vLo16));
+         return assignNew('V', mce, Ity_I32, binop(Iop_16HLto32, vHi16, vLo16));
       }
 
       case Iop_MullS8:
       case Iop_MullU8: {
          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
-         return assignNew(mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
+         return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
       }
 
       case Iop_DivS32:
@@ -2424,7 +2506,7 @@
       do_And_Or:
          return
          assignNew(
-            mce, 
+            'V', mce, 
             and_or_ty,
             difd(mce, uifu(mce, vatom1, vatom2),
                       difd(mce, improve(mce, atom1, vatom1),
@@ -2484,7 +2566,7 @@
       case Iop_Dup8x16:
       case Iop_Dup16x8:
       case Iop_Dup32x4:
-         return assignNew(mce, Ity_V128, unop(op, vatom));
+         return assignNew('V', mce, Ity_V128, unop(op, vatom));
 
       case Iop_F32toF64: 
       case Iop_I32toF64:
@@ -2511,7 +2593,7 @@
       case Iop_V128HIto64:
       case Iop_128HIto64:
       case Iop_128to64:
-         return assignNew(mce, Ity_I64, unop(op, vatom));
+         return assignNew('V', mce, Ity_I64, unop(op, vatom));
 
       case Iop_64to32:
       case Iop_64HIto32:
@@ -2522,27 +2604,27 @@
       case Iop_16Sto32:
       case Iop_8Sto32:
       case Iop_V128to32:
-         return assignNew(mce, Ity_I32, unop(op, vatom));
+         return assignNew('V', mce, Ity_I32, unop(op, vatom));
 
       case Iop_8Sto16:
       case Iop_8Uto16:
       case Iop_32to16:
       case Iop_32HIto16:
       case Iop_64to16:
-         return assignNew(mce, Ity_I16, unop(op, vatom));
+         return assignNew('V', mce, Ity_I16, unop(op, vatom));
 
       case Iop_1Uto8:
       case Iop_16to8:
       case Iop_16HIto8:
       case Iop_32to8:
       case Iop_64to8:
-         return assignNew(mce, Ity_I8, unop(op, vatom));
+         return assignNew('V', mce, Ity_I8, unop(op, vatom));
 
       case Iop_32to1:
-         return assignNew(mce, Ity_I1, unop(Iop_32to1, vatom));
+         return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
 
       case Iop_64to1:
-         return assignNew(mce, Ity_I1, unop(Iop_64to1, vatom));
+         return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
 
       case Iop_ReinterpF64asI64:
       case Iop_ReinterpI64asF64:
@@ -2583,7 +2665,7 @@
 
    /* Now cook up a call to the relevant helper function, to read the
       data V bits from shadow memory. */
-   ty = shadowType(ty);
+   ty = shadowTypeV(ty);
 
    if (end == Iend_LE) {   
       switch (ty) {
@@ -2631,7 +2713,7 @@
       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
-      addrAct = assignNew(mce, tyAddr, binop(mkAdd, addr, eBias) );
+      addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
    }
 
    /* We need to have a place to park the V bits we're just about to
@@ -2642,7 +2724,7 @@
                            hname, VG_(fnptr_to_fnentry)( helper ), 
                            mkIRExprVec_1( addrAct ));
    setHelperAnns( mce, di );
-   stmt( mce->bb, IRStmt_Dirty(di) );
+   stmt( 'V', mce, IRStmt_Dirty(di) );
 
    return mkexpr(datavbits);
 }
@@ -2655,7 +2737,7 @@
 {
    IRAtom *v64hi, *v64lo;
    tl_assert(end == Iend_LE || end == Iend_BE);
-   switch (shadowType(ty)) {
+   switch (shadowTypeV(ty)) {
       case Ity_I8: 
       case Ity_I16: 
       case Ity_I32: 
@@ -2669,7 +2751,7 @@
             v64hi = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias);
             v64lo = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+8);
          }
-         return assignNew( mce, 
+         return assignNew( 'V', mce, 
                            Ity_V128, 
                            binop(Iop_64HLtoV128, v64hi, v64lo));
       default:
@@ -2699,7 +2781,8 @@
    ty = typeOfIRExpr(mce->bb->tyenv, vbits0);
 
    return
-      mkUifU(mce, ty, assignNew(mce, ty, IRExpr_Mux0X(cond, vbits0, vbitsX)),
+      mkUifU(mce, ty, assignNew('V', mce, ty, 
+                                     IRExpr_Mux0X(cond, vbits0, vbitsX)),
                       mkPCastTo(mce, ty, vbitsC) );
 }      
 
@@ -2718,10 +2801,10 @@
                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
 
       case Iex_RdTmp:
-         return IRExpr_RdTmp( findShadowTmp(mce, e->Iex.RdTmp.tmp) );
+         return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
 
       case Iex_Const:
-         return definedOfType(shadowType(typeOfIRExpr(mce->bb->tyenv, e)));
+         return definedOfType(shadowTypeV(typeOfIRExpr(mce->bb->tyenv, e)));
 
       case Iex_Qop:
          return expr2vbits_Qop(
@@ -2789,20 +2872,28 @@
 
    if (tyH == Ity_I32) {
       switch (ty) {
-         case Ity_I32: return vatom;
-         case Ity_I16: return assignNew(mce, tyH, unop(Iop_16Uto32, vatom));
-         case Ity_I8:  return assignNew(mce, tyH, unop(Iop_8Uto32, vatom));
-         default:      goto unhandled;
+         case Ity_I32:
+            return vatom;
+         case Ity_I16:
+            return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
+         case Ity_I8:
+            return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
+         default:
+            goto unhandled;
       }
    } else
    if (tyH == Ity_I64) {
       switch (ty) {
-         case Ity_I32: return assignNew(mce, tyH, unop(Iop_32Uto64, vatom));
-         case Ity_I16: return assignNew(mce, tyH, unop(Iop_32Uto64, 
-                              assignNew(mce, Ity_I32, unop(Iop_16Uto32, vatom))));
-         case Ity_I8:  return assignNew(mce, tyH, unop(Iop_32Uto64, 
-                              assignNew(mce, Ity_I32, unop(Iop_8Uto32, vatom))));
-         default:      goto unhandled;
+         case Ity_I32:
+            return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
+         case Ity_I16:
+            return assignNew('V', mce, tyH, unop(Iop_32Uto64, 
+                   assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
+         case Ity_I8:
+            return assignNew('V', mce, tyH, unop(Iop_32Uto64, 
+                   assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
+         default:
+            goto unhandled;
       }
    } else {
       goto unhandled;
@@ -2860,7 +2951,7 @@
    // If we're not doing undefined value checking, pretend that this value
    // is "all valid".  That lets Vex's optimiser remove some of the V bit
    // shadow computation ops that precede it.
-   if (!MC_(clo_undef_value_errors)) {
+   if (MC_(clo_mc_level) == 1) {
       switch (ty) {
          case Ity_V128: c = IRConst_V128(V_BITS16_DEFINED); break; // V128 weirdness
          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
@@ -2930,16 +3021,16 @@
       }
 
       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
-      addrLo64  = assignNew(mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
-      vdataLo64 = assignNew(mce, Ity_I64, unop(Iop_V128to64, vdata));
+      addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
+      vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
       diLo64    = unsafeIRDirty_0_N( 
                      1/*regparms*/, 
                      hname, VG_(fnptr_to_fnentry)( helper ), 
                      mkIRExprVec_2( addrLo64, vdataLo64 )
                   );
       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
-      addrHi64  = assignNew(mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
-      vdataHi64 = assignNew(mce, Ity_I64, unop(Iop_V128HIto64, vdata));
+      addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
+      vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
       diHi64    = unsafeIRDirty_0_N( 
                      1/*regparms*/, 
                      hname, VG_(fnptr_to_fnentry)( helper ), 
@@ -2947,8 +3038,8 @@
                   );
       setHelperAnns( mce, diLo64 );
       setHelperAnns( mce, diHi64 );
-      stmt( mce->bb, IRStmt_Dirty(diLo64) );
-      stmt( mce->bb, IRStmt_Dirty(diHi64) );
+      stmt( 'V', mce, IRStmt_Dirty(diLo64) );
+      stmt( 'V', mce, IRStmt_Dirty(diHi64) );
 
    } else {
 
@@ -2958,7 +3049,7 @@
          addrAct = addr;
       } else {
          eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
-         addrAct = assignNew(mce, tyAddr, binop(mkAdd, addr, eBias) );
+         addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
       }
 
       if (ty == Ity_I64) {
@@ -2979,7 +3070,7 @@
               );
       }
       setHelperAnns( mce, di );
-      stmt( mce->bb, IRStmt_Dirty(di) );
+      stmt( 'V', mce, IRStmt_Dirty(di) );
    }
 
 }
@@ -3060,8 +3151,8 @@
          /* update 'curr' with UifU of the state slice 
             gOff .. gOff+n-1 */
          tySrc = szToITy( n );
-         src   = assignNew( mce, tySrc, 
-                            shadow_GET(mce, gOff, tySrc ) );
+         src   = assignNew( 'V', mce, tySrc, 
+                                 shadow_GET(mce, gOff, tySrc ) );
          here = mkPCastTo( mce, Ity_I32, src );
          curr = mkUifU32(mce, here, curr);
          gSz -= n;
@@ -3123,9 +3214,9 @@
 
    /* Outputs: the destination temporary, if there is one. */
    if (d->tmp != IRTemp_INVALID) {
-      dst   = findShadowTmp(mce, d->tmp);
+      dst   = findShadowTmpV(mce, d->tmp);
       tyDst = typeOfIRTemp(mce->bb->tyenv, d->tmp);
-      assign( mce->bb, dst, mkPCastTo( mce, tyDst, curr) );
+      assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
    }
 
    /* Outputs: guest state that we write or modify. */
@@ -3185,19 +3276,28 @@
    notify the A/V bit machinery of this fact.
 
    We call 
-   void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len );
+   void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
+                                                    Addr nia );
 */
 static
-void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len )
+void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
 {
    IRDirty* di;
+   /* Minor optimisation: if not doing origin tracking, ignore the
+      supplied nia and pass zero instead.  This is on the basis that
+      MC_(helperc_MAKE_STACK_UNINIT) will ignore it anyway, and we can
+      almost always generate a shorter instruction to put zero into a
+      register than any other value. */
+   if (MC_(clo_mc_level) < 3)
+      nia = mkIRExpr_HWord(0);
+
    di = unsafeIRDirty_0_N(
            0/*regparms*/,
            "MC_(helperc_MAKE_STACK_UNINIT)",
            VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT) ),
-           mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
+           mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
         );
-   stmt( mce->bb, IRStmt_Dirty(di) );
+   stmt( 'V', mce, IRStmt_Dirty(di) );
 }
 
 
@@ -3205,6 +3305,8 @@
 /*--- Memcheck main                                        ---*/
 /*------------------------------------------------------------*/
 
+static void schemeS ( MCEnv* mce, IRStmt* st );
+
 static Bool isBogusAtom ( IRAtom* at )
 {
    ULong n = 0;
@@ -3301,7 +3403,8 @@
       case Ist_Exit:
          return isBogusAtom(st->Ist.Exit.guard);
       case Ist_AbiHint:
-         return isBogusAtom(st->Ist.AbiHint.base);
+         return isBogusAtom(st->Ist.AbiHint.base)
+                || isBogusAtom(st->Ist.AbiHint.nia);
       case Ist_NoOp:
       case Ist_IMark:
       case Ist_MBE:
@@ -3320,7 +3423,7 @@
                         VexGuestExtents* vge,
                         IRType gWordTy, IRType hWordTy )
 {
-   Bool    verboze = False; //True; 
+   Bool    verboze = 0||False;
    Bool    bogus;
    Int     i, j, first_stmt;
    IRStmt* st;
@@ -3333,12 +3436,16 @@
    }
 
    /* Check we're not completely nuts */
-   tl_assert(sizeof(UWord) == sizeof(void*));
-   tl_assert(sizeof(Word)  == sizeof(void*));
-   tl_assert(sizeof(ULong) == 8);
-   tl_assert(sizeof(Long)  == 8);
-   tl_assert(sizeof(UInt)  == 4);
-   tl_assert(sizeof(Int)   == 4);
+   tl_assert(sizeof(UWord)  == sizeof(void*));
+   tl_assert(sizeof(Word)   == sizeof(void*));
+   tl_assert(sizeof(Addr)   == sizeof(void*));
+   tl_assert(sizeof(ULong)  == 8);
+   tl_assert(sizeof(Long)   == 8);
+   tl_assert(sizeof(Addr64) == 8);
+   tl_assert(sizeof(UInt)   == 4);
+   tl_assert(sizeof(Int)    == 4);
+
+   tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
 
    /* Set up SB */
    bb = deepCopyIRSBExceptStmts(bb_in);
@@ -3346,13 +3453,17 @@
    /* Set up the running environment.  Only .bb is modified as we go
       along. */
    mce.bb             = bb;
+   mce.trace          = verboze;
    mce.layout         = layout;
    mce.n_originalTmps = bb->tyenv->types_used;
    mce.hWordTy        = hWordTy;
    mce.bogusLiterals  = False;
-   mce.tmpMap         = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
-   for (i = 0; i < mce.n_originalTmps; i++)
-      mce.tmpMap[i] = IRTemp_INVALID;
+   mce.tmpMapV        = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
+   mce.tmpMapB        = LibVEX_Alloc(mce.n_originalTmps * sizeof(IRTemp));
+   for (i = 0; i < mce.n_originalTmps; i++) {
+      mce.tmpMapV[i] = IRTemp_INVALID;
+      mce.tmpMapB[i] = IRTemp_INVALID;
+   }
 
    /* Make a preliminary inspection of the statements, to see if there
       are any dodgy-looking literals.  If there are, we generate
@@ -3392,7 +3503,7 @@
       tl_assert(st);
       tl_assert(isFlatIRStmt(st));
 
-      addStmtToIRSB( bb, bb_in->stmts[i] );
+      stmt( 'C', &mce, bb_in->stmts[i] );
       i++;
    }
 
@@ -3416,12 +3527,12 @@
    */
    for (j = 0; j < i; j++) {
       if (bb_in->stmts[j]->tag == Ist_WrTmp) {
-         /* findShadowTmp checks its arg is an original tmp;
+         /* findShadowTmpV checks its arg is an original tmp;
             no need to assert that here. */
          IRTemp tmp_o = bb_in->stmts[j]->Ist.WrTmp.tmp;
-         IRTemp tmp_s = findShadowTmp(&mce, tmp_o);
+         IRTemp tmp_s = findShadowTmpV(&mce, tmp_o);
          IRType ty_s  = typeOfIRTemp(bb->tyenv, tmp_s);
-         assign( bb, tmp_s, definedOfType( ty_s ) );
+         assign( 'V', &mce, tmp_s, definedOfType( ty_s ) );
          if (0) {
             VG_(printf)("create shadow tmp for preamble tmp [%d] ty ", j);
             ppIRType( ty_s );
@@ -3443,17 +3554,21 @@
       first_stmt = bb->stmts_used;
 
       if (verboze) {
+         VG_(printf)("\n");
          ppIRStmt(st);
-         VG_(printf)("\n\n");
+         VG_(printf)("\n");
       }
 
+      if (MC_(clo_mc_level) == 3)
+         schemeS( &mce, st );
+
       /* Generate instrumentation code for each stmt ... */
 
       switch (st->tag) {
 
          case Ist_WrTmp:
-            assign( bb, findShadowTmp(&mce, st->Ist.WrTmp.tmp), 
-                        expr2vbits( &mce, st->Ist.WrTmp.data) );
+            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp), 
+                               expr2vbits( &mce, st->Ist.WrTmp.data) );
             break;
 
          case Ist_Put:
@@ -3482,8 +3597,10 @@
             complainIfUndefined( &mce, st->Ist.Exit.guard );
             break;
 
-         case Ist_NoOp:
          case Ist_IMark:
+            break;
+
+         case Ist_NoOp:
          case Ist_MBE:
             break;
 
@@ -3492,7 +3609,9 @@
             break;
 
          case Ist_AbiHint:
-            do_AbiHint( &mce, st->Ist.AbiHint.base, st->Ist.AbiHint.len );
+            do_AbiHint( &mce, st->Ist.AbiHint.base,
+                              st->Ist.AbiHint.len,
+                              st->Ist.AbiHint.nia );
             break;
 
          default:
@@ -3503,7 +3622,7 @@
 
       } /* switch (st->tag) */
 
-      if (verboze) {
+      if (0 && verboze) {
          for (j = first_stmt; j < bb->stmts_used; j++) {
             VG_(printf)("   ");
             ppIRStmt(bb->stmts[j]);
@@ -3513,7 +3632,7 @@
       }
 
       /* ... and finally copy the stmt itself to the output. */
-      addStmtToIRSB(bb, st);
+      stmt('C', &mce, st);
 
    }
 
@@ -3528,7 +3647,7 @@
 
    complainIfUndefined( &mce, bb->next );
 
-   if (verboze) {
+   if (0 && verboze) {
       for (j = first_stmt; j < bb->stmts_used; j++) {
          VG_(printf)("   ");
          ppIRStmt(bb->stmts[j]);
@@ -3547,7 +3666,7 @@
 /* This exploits the observation that Memcheck often produces
    repeated conditional calls of the form
 
-   Dirty G MC_(helperc_value_check0/1/4/8_fail)()
+   Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
 
    with the same guard expression G guarding the same helper call.
    The second and subsequent calls are redundant.  This usually
@@ -3636,10 +3755,14 @@
 static Bool is_helperc_value_checkN_fail ( HChar* name )
 {
    return
-      0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail)")
-      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail)")
-      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail)")
-      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail)");
+      0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_no_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_w_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_w_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_w_o)")
+      || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_w_o)");
 }
 
 IRSB* MC_(final_tidy) ( IRSB* sb_in )
@@ -3683,6 +3806,509 @@
 }
 
 
+/*------------------------------------------------------------*/
+/*--- Origin tracking stuff                                ---*/
+/*------------------------------------------------------------*/
+
+static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
+{
+   tl_assert(orig < mce->n_originalTmps);
+   if (mce->tmpMapB[orig] == IRTemp_INVALID) {
+      mce->tmpMapB[orig] 
+         = newIRTemp(mce->bb->tyenv, Ity_I32);
+   }
+   return mce->tmpMapB[orig];
+}
+
+static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
+{
+   return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
+}
+
+static IRAtom* gen_load_b ( MCEnv* mce, Int szB, 
+                            IRAtom* baseaddr, Int offset )
+{
+   void*    hFun;
+   HChar*   hName;
+   IRTemp   bTmp;
+   IRDirty* di;
+   IRType   aTy   = typeOfIRExpr( mce->bb->tyenv, baseaddr );
+   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
+   IRAtom*  ea    = baseaddr;
+   if (offset != 0) {
+      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
+                                   : mkU64( (Long)(Int)offset );
+      ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
+   }
+   bTmp = newIRTemp(mce->bb->tyenv, mce->hWordTy);
+
+   switch (szB) {
+      case 1: hFun  = (void*)&MC_(helperc_b_load1);
+              hName = "MC_(helperc_b_load1)";
+              break;
+      case 2: hFun  = (void*)&MC_(helperc_b_load2);
+              hName = "MC_(helperc_b_load2)";
+              break;
+      case 4: hFun  = (void*)&MC_(helperc_b_load4);
+              hName = "MC_(helperc_b_load4)";
+              break;
+      case 8: hFun  = (void*)&MC_(helperc_b_load8);
+              hName = "MC_(helperc_b_load8)";
+              break;
+      case 16: hFun  = (void*)&MC_(helperc_b_load16);
+               hName = "MC_(helperc_b_load16)";
+               break;
+      default:
+         VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
+         tl_assert(0);
+   }
+   di = unsafeIRDirty_1_N(
+           bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
+           mkIRExprVec_1( ea )
+        );
+   /* no need to mess with any annotations.  This call accesses
+      neither guest state nor guest memory. */
+   stmt( 'B', mce, IRStmt_Dirty(di) );
+   if (mce->hWordTy == Ity_I64) {
+      /* 64-bit host */
+      IRTemp bTmp32 = newIRTemp(mce->bb->tyenv, Ity_I32);
+      assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
+      return mkexpr(bTmp32);
+   } else {
+      /* 32-bit host */
+      return mkexpr(bTmp);
+   }
+}
+static void gen_store_b ( MCEnv* mce, Int szB,
+                          IRAtom* baseaddr, Int offset, IRAtom* dataB )
+{
+   void*    hFun;
+   HChar*   hName;
+   IRDirty* di;
+   IRType   aTy   = typeOfIRExpr( mce->bb->tyenv, baseaddr );
+   IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
+   IRAtom*  ea    = baseaddr;
+   if (offset != 0) {
+      IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
+                                   : mkU64( (Long)(Int)offset );
+      ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
+   }
+   if (mce->hWordTy == Ity_I64)
+      dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
+
+   switch (szB) {
+      case 1: hFun  = (void*)&MC_(helperc_b_store1);
+              hName = "MC_(helperc_b_store1)";
+              break;
+      case 2: hFun  = (void*)&MC_(helperc_b_store2);
+              hName = "MC_(helperc_b_store2)";
+              break;
+      case 4: hFun  = (void*)&MC_(helperc_b_store4);
+              hName = "MC_(helperc_b_store4)";
+              break;
+      case 8: hFun  = (void*)&MC_(helperc_b_store8);
+              hName = "MC_(helperc_b_store8)";
+              break;
+      case 16: hFun  = (void*)&MC_(helperc_b_store16);
+               hName = "MC_(helperc_b_store16)";
+               break;
+      default:
+         tl_assert(0);
+   }
+   di = unsafeIRDirty_0_N( 2/*regparms*/,
+           hName, VG_(fnptr_to_fnentry)( hFun ),
+           mkIRExprVec_2( ea, dataB )
+        );
+   /* no need to mess with any annotations.  This call accesses
+      neither guest state nor guest memory. */
+   stmt( 'B', mce, IRStmt_Dirty(di) );
+}
+
+static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
+   IRType eTy = typeOfIRExpr(mce->bb->tyenv, e);
+   if (eTy == Ity_I64)
+      return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
+   if (eTy == Ity_I32)
+      return e;
+   tl_assert(0);
+}
+
+static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
+   IRType eTy = typeOfIRExpr(mce->bb->tyenv, e);
+   tl_assert(eTy == Ity_I32);
+   if (dstTy == Ity_I64)
+      return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
+   tl_assert(0);
+}
+
+static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
+{
+   tl_assert(MC_(clo_mc_level) == 3);
+
+   switch (e->tag) {
+
+      case Iex_GetI: {
+         IRRegArray* descr_b;
+         IRAtom      *t1, *t2, *t3, *t4;
+         IRRegArray* descr      = e->Iex.GetI.descr;
+         IRType equivIntTy 
+            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
+         /* If this array is unshadowable for whatever reason, use the
+            usual approximation. */
+         if (equivIntTy == Ity_INVALID)
+            return mkU32(0);
+         tl_assert(sizeofIRType(equivIntTy) >= 4);
+         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
+         descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
+                                 equivIntTy, descr->nElems );
+         /* Do a shadow indexed get of the same size, giving t1.  Take
+            the bottom 32 bits of it, giving t2.  Compute into t3 the
+            origin for the index (almost certainly zero, but there's
+            no harm in being completely general here, since iropt will
+            remove any useless code), and fold it in, giving a final
+            value t4. */
+         t1 = assignNew( 'B', mce, equivIntTy, 
+                          IRExpr_GetI( descr_b, e->Iex.GetI.ix, 
+                                                e->Iex.GetI.bias ));
+         t2 = narrowTo32( mce, t1 );
+         t3 = schemeE( mce, e->Iex.GetI.ix );
+         t4 = gen_maxU32( mce, t2, t3 );
+         return t4;
+      }
+      case Iex_CCall: {
+         Int i;
+         IRAtom*  here;
+         IRExpr** args = e->Iex.CCall.args;
+         IRAtom*  curr = mkU32(0);
+         for (i = 0; args[i]; i++) {
+            tl_assert(i < 32);
+            tl_assert(isOriginalAtom(mce, args[i]));
+            /* Only take notice of this arg if the callee's
+               mc-exclusion mask does not say it is to be excluded. */
+            if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
+               /* the arg is to be excluded from definedness checking.
+                  Do nothing. */
+               if (0) VG_(printf)("excluding %s(%d)\n",
+                                  e->Iex.CCall.cee->name, i);
+            } else {
+               /* calculate the arg's definedness, and pessimistically
+                  merge it in. */
+               here = schemeE( mce, args[i] );
+               curr = gen_maxU32( mce, curr, here );
+            }
+         }
+         return curr;
+      }
+      case Iex_Load: {
+         Int dszB;
+         dszB = sizeofIRType(e->Iex.Load.ty);
+         /* assert that the B value for the address is already
+            available (somewhere) */
+         tl_assert(isIRAtom(e->Iex.Load.addr));
+         tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
+         return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
+      }
+      case Iex_Mux0X: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Mux0X.cond );
+         IRAtom* b2 = schemeE( mce, e->Iex.Mux0X.expr0 );
+         IRAtom* b3 = schemeE( mce, e->Iex.Mux0X.exprX );
+         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
+      }
+      case Iex_Qop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Qop.arg1 );
+         IRAtom* b2 = schemeE( mce, e->Iex.Qop.arg2 );
+         IRAtom* b3 = schemeE( mce, e->Iex.Qop.arg3 );
+         IRAtom* b4 = schemeE( mce, e->Iex.Qop.arg4 );
+         return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
+                                 gen_maxU32( mce, b3, b4 ) );
+      }
+      case Iex_Triop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Triop.arg1 );
+         IRAtom* b2 = schemeE( mce, e->Iex.Triop.arg2 );
+         IRAtom* b3 = schemeE( mce, e->Iex.Triop.arg3 );
+         return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
+      }
+      case Iex_Binop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
+         IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
+         return gen_maxU32( mce, b1, b2 );
+      }
+      case Iex_Unop: {
+         IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
+         return b1;
+      }
+      case Iex_Const:
+         return mkU32(0);
+      case Iex_RdTmp:
+         return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
+      case Iex_Get: {
+         Int b_offset = MC_(get_otrack_shadow_offset)( 
+                           e->Iex.Get.offset,
+                           sizeofIRType(e->Iex.Get.ty) 
+                        );
+         tl_assert(b_offset >= -1
+                   && b_offset <= mce->layout->total_sizeB -4);
+         if (b_offset >= 0) {
+            /* FIXME: this isn't an atom! */
+            return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
+                               Ity_I32 );
+         }
+         return mkU32(0);
+      }
+      default:
+         VG_(printf)("mc_translate.c: schemeE: unhandled: ");
+         ppIRExpr(e); 
+         VG_(tool_panic)("memcheck:schemeE");
+   }
+}
+
+static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
+{
+   // This is a hacked version of do_shadow_Dirty
+   Int       i, n, offset, toDo, gSz, gOff;
+   IRAtom    *here, *curr;
+   IRTemp    dst;
+   IREndness end;
+
+   /* What's the native endianness?  We need to know this. */
+#  if defined(VG_BIGENDIAN)
+   end = Iend_BE;
+#  elif defined(VG_LITTLEENDIAN)
+   end = Iend_LE;
+#  else
+#    error "Unknown endianness"
+#  endif
+
+   /* First check the guard. */
+   curr = schemeE( mce, d->guard );
+
+   /* Now round up all inputs and maxU32 over them. */
+
+   /* Inputs: unmasked args */
+   for (i = 0; d->args[i]; i++) {
+      if (d->cee->mcx_mask & (1<<i)) {
+         /* ignore this arg */
+      } else {
+         here = schemeE( mce, d->args[i] );
+         curr = gen_maxU32( mce, curr, here );
+      }
+   }
+
+   /* Inputs: guest state that we read. */
+   for (i = 0; i < d->nFxState; i++) {
+      tl_assert(d->fxState[i].fx != Ifx_None);
+      if (d->fxState[i].fx == Ifx_Write)
+         continue;
+
+      /* Ignore any sections marked as 'always defined'. */
+      if (isAlwaysDefd(mce, d->fxState[i].offset, d->fxState[i].size )) {
+         if (0)
+         VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
+                     d->fxState[i].offset, d->fxState[i].size );
+         continue;
+      }
+
+      /* This state element is read or modified.  So we need to
+         consider it.  If larger than 4 bytes, deal with it in 4-byte
+         chunks. */
+      gSz  = d->fxState[i].size;
+      gOff = d->fxState[i].offset;
+      tl_assert(gSz > 0);
+      while (True) {
+         Int b_offset;
+         if (gSz == 0) break;
+         n = gSz <= 4 ? gSz : 4;
+         /* update 'curr' with maxU32 of the state slice 
+            gOff .. gOff+n-1 */
+         b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
+         if (b_offset != -1) {
+            here = assignNew( 'B',mce,
+                               Ity_I32,
+                               IRExpr_Get(b_offset + 2*mce->layout->total_sizeB,
+                                          Ity_I32));
+            curr = gen_maxU32( mce, curr, here );
+         }
+         gSz -= n;
+         gOff += n;
+      }
+
+   }
+
+   /* Inputs: memory */
+
+   if (d->mFx != Ifx_None) {
+      /* Because we may do multiple shadow loads/stores from the same
+         base address, it's best to do a single test of its
+         definedness right now.  Post-instrumentation optimisation
+         should remove all but this test. */
+      tl_assert(d->mAddr);
+      here = schemeE( mce, d->mAddr );
+      curr = gen_maxU32( mce, curr, here );
+   }
+
+   /* Deal with memory inputs (reads or modifies) */
+   if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
+      offset = 0;
+      toDo   = d->mSize;
+      /* chew off 32-bit chunks.  We don't care about the endianness
+         since it's all going to be condensed down to a single bit,
+         but nevertheless choose an endianness which is hopefully
+         native to the platform. */
+      while (toDo >= 4) {
+         here = gen_load_b( mce, 4, d->mAddr, d->mSize - toDo );
+         curr = gen_maxU32( mce, curr, here );
+         toDo -= 4;
+      }
+      if (toDo != 0)
+         VG_(printf)("Approx: do_origins_Dirty(R): missed %d bytes\n",
+                     (Int)toDo );
+      //tl_assert(toDo == 0); /* also need to handle 1,2-byte excess */
+   }
+
+   /* Whew!  So curr is a 32-bit B-value which should give an origin
+      of some use if any of the inputs to the helper are undefined.
+      Now we need to re-distribute the results to all destinations. */
+
+   /* Outputs: the destination temporary, if there is one. */
+   if (d->tmp != IRTemp_INVALID) {
+      dst   = findShadowTmpB(mce, d->tmp);
+      assign( 'V', mce, dst, curr );
+   }
+
+   /* Outputs: guest state that we write or modify. */
+   for (i = 0; i < d->nFxState; i++) {
+      tl_assert(d->fxState[i].fx != Ifx_None);
+      if (d->fxState[i].fx == Ifx_Read)
+         continue;
+
+      /* Ignore any sections marked as 'always defined'. */
+      if (isAlwaysDefd(mce, d->fxState[i].offset, d->fxState[i].size ))
+         continue;
+
+      /* This state element is written or modified.  So we need to
+         consider it.  If larger than 4 bytes, deal with it in 4-byte
+         chunks. */
+      gSz  = d->fxState[i].size;
+      gOff = d->fxState[i].offset;
+      tl_assert(gSz > 0);
+      while (True) {
+         Int b_offset;
+         if (gSz == 0) break;
+         n = gSz <= 4 ? gSz : 4;
+         /* Write 'curr' to the state slice gOff .. gOff+n-1 */
+         b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
+         if (b_offset != -1) {
+           stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
+                                      curr ));
+         }
+         gSz -= n;
+         gOff += n;
+      }
+   }
+
+   /* Outputs: memory that we write or modify.  Same comments about
+      endianness as above apply. */
+   if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
+      offset = 0;
+      toDo   = d->mSize;
+      /* chew off 32-bit chunks */
+      while (toDo >= 4) {
+         gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr );
+         toDo -= 4;
+      }
+      if (toDo != 0)
+         VG_(printf)("Approx: do_origins_Dirty(W): missed %d bytes\n",
+                     (Int)toDo );
+      //tl_assert(toDo == 0); /* also need to handle 1,2-byte excess */
+   }
+
+}
+
+static void schemeS ( MCEnv* mce, IRStmt* st )
+{
+   tl_assert(MC_(clo_mc_level) == 3);
+
+   switch (st->tag) {
+
+      case Ist_AbiHint:
+         /* The value-check instrumenter handles this - by arranging
+            to pass the address of the next instruction to
+            MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
+            happen for origin tracking w.r.t. AbiHints.  So there is
+            nothing to do here. */
+         break;
+
+      case Ist_PutI: {
+         IRRegArray* descr_b;
+         IRAtom      *t1, *t2, *t3, *t4;
+         IRRegArray* descr = st->Ist.PutI.descr;
+         IRType equivIntTy
+            = MC_(get_otrack_reg_array_equiv_int_type)(descr);
+         /* If this array is unshadowable for whatever reason,
+            generate no code. */
+         if (equivIntTy == Ity_INVALID)
+            break;
+         tl_assert(sizeofIRType(equivIntTy) >= 4);
+         tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
+         descr_b
+            = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
+                            equivIntTy, descr->nElems );
+         /* Compute a value to Put - the conjoinment of the origin for
+            the data to be Put-ted (obviously) and of the index value
+            (not so obviously). */
+         t1 = schemeE( mce, st->Ist.PutI.data );
+         t2 = schemeE( mce, st->Ist.PutI.ix );
+         t3 = gen_maxU32( mce, t1, t2 );
+         t4 = zWidenFrom32( mce, equivIntTy, t3 );
+         stmt( 'B', mce, IRStmt_PutI( descr_b, st->Ist.PutI.ix,
+                                      st->Ist.PutI.bias, t4 ));
+         break;
+      }
+      case Ist_Dirty:
+         do_origins_Dirty( mce, st->Ist.Dirty.details );
+         break;
+      case Ist_Store: {
+         Int     dszB;
+         IRAtom* dataB;
+         /* assert that the B value for the address is already
+            available (somewhere) */
+         tl_assert(isIRAtom(st->Ist.Store.addr));
+         dszB = sizeofIRType(
+                   typeOfIRExpr(mce->bb->tyenv, st->Ist.Store.data ));
+         dataB = schemeE( mce, st->Ist.Store.data );
+         gen_store_b( mce, dszB, st->Ist.Store.addr, 0/*offset*/, dataB );
+         break;
+      }
+      case Ist_Put: {
+         Int b_offset
+            = MC_(get_otrack_shadow_offset)(
+                 st->Ist.Put.offset,
+                 sizeofIRType(typeOfIRExpr(mce->bb->tyenv, st->Ist.Put.data))
+              );
+         if (b_offset >= 0) {
+            /* FIXME: this isn't an atom! */
+            stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB, 
+                                       schemeE( mce, st->Ist.Put.data )) );
+         }
+         break;
+      }
+      case Ist_WrTmp:
+         assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
+                           schemeE(mce, st->Ist.WrTmp.data) );
+         break;
+      case Ist_MBE:
+      case Ist_NoOp:
+      case Ist_Exit:
+      case Ist_IMark:
+         break;
+      default:
+         VG_(printf)("mc_translate.c: schemeS: unhandled: ");
+         ppIRStmt(st); 
+         VG_(tool_panic)("memcheck:schemeS");
+   }
+}
+
+
 /*--------------------------------------------------------------------*/
 /*--- end                                           mc_translate.c ---*/
 /*--------------------------------------------------------------------*/
diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am
index c212cc3..fe81898 100644
--- a/memcheck/tests/Makefile.am
+++ b/memcheck/tests/Makefile.am
@@ -98,8 +98,21 @@
 	new_override.stderr.exp new_override.stdout.exp new_override.vgtest \
 	noisy_child.vgtest noisy_child.stderr.exp noisy_child.stdout.exp \
 	null_socket.stderr.exp null_socket.vgtest \
-	overlap.stderr.exp overlap.stdout.exp overlap.vgtest \
+	origin1-yes.vgtest origin1-yes.stdout.exp \
+	origin1-yes.stderr.exp \
+	origin2-not-quite.vgtest origin2-not-quite.stdout.exp \
+	origin2-not-quite.stderr.exp \
+	origin3-no.vgtest origin3-no.stdout.exp \
+	origin3-no.stderr.exp \
+	origin4-many.vgtest origin4-many.stdout.exp \
+	origin4-many.stderr.exp-glibc25-x86 \
+	origin4-many.stderr.exp-glibc25-amd64 \
+	origin5-bz2.vgtest origin5-bz2.stdout.exp \
+	origin5-bz2.stderr.exp-glibc25-x86 \
+	origin5-bz2.stderr.exp-glibc25-amd64 \
+	origin5-bz2.stderr.exp-glibc27-ppc64 \
 	oset_test.stderr.exp oset_test.stdout.exp oset_test.vgtest \
+	overlap.stderr.exp overlap.stdout.exp overlap.vgtest \
 	partiallydefinedeq.vgtest partiallydefinedeq.stderr.exp \
 	partiallydefinedeq.stderr.exp2 \
 	partiallydefinedeq.stdout.exp \
@@ -185,7 +198,9 @@
 	memalign_test memalign2 memcmptest mempool mmaptest \
 	nanoleak nanoleak2 new_nothrow \
 	noisy_child \
-	null_socket oset_test overlap \
+	null_socket oset_test \
+	origin1-yes origin2-not-quite origin3-no origin4-many origin5-bz2 \
+	overlap \
 	partiallydefinedeq \
 	partial_load pdb-realloc pdb-realloc2 \
 	pipe pointer-trace \
@@ -232,12 +247,16 @@
 supp2_SOURCES		= supp.c
 # To make it a bit more realistic, have some optimisation enabled
 # for the varinfo tests.  We still expect sane results.
-varinfo1_CFLAGS		= -O -g
-varinfo2_CFLAGS		= -O -g
-varinfo3_CFLAGS		= -O -g
-varinfo4_CFLAGS		= -O -g
-varinfo5_CFLAGS		= -O -g
-varinfo6_CFLAGS		= -O -g
+varinfo1_CFLAGS		= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
+varinfo2_CFLAGS		= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
+varinfo3_CFLAGS		= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
+varinfo4_CFLAGS		= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
+varinfo5_CFLAGS		= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
+varinfo6_CFLAGS		= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
+# This requires optimisation in order to get just one resulting error
+origin4_many_CFLAGS	= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
+# Apply -O so as to run in reasonable time
+origin5_bz2_CFLAGS	= $(AM_FLAG_M3264_PRI) $(AM_CFLAGS) -O -g
 
 # C++ tests
 mismatches_SOURCES	= mismatches.cpp
diff --git a/memcheck/tests/origin1-yes.c b/memcheck/tests/origin1-yes.c
new file mode 100644
index 0000000..b46fa17
--- /dev/null
+++ b/memcheck/tests/origin1-yes.c
@@ -0,0 +1,135 @@
+
+/* This test case was originally written by Nicholas Nethercote. */
+
+// This test covers all the different sources of values, both defined and
+// undefined.  It only involves undefined condition errors.
+//
+// Nb: a stack frame is allocated when a signal is delivered.  But it
+// immediately get written with stuff, so there's no significant possibility
+// of undefined values originating there.  So we ignore it.  (On platforms
+// like AMD64 that have a redzone just beyond the stack pointer there is a
+// possibility, but it's so slim we ignore it.)
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include "../memcheck.h"
+
+int x = 0;
+
+int main(void)
+{
+   assert(1 == sizeof(char));
+   assert(2 == sizeof(short));
+   assert(4 == sizeof(int));
+   assert(8 == sizeof(long long));
+
+   //------------------------------------------------------------------------
+   // Sources of undefined values
+   //------------------------------------------------------------------------
+
+   // Stack, 32-bit
+   {
+      volatile int undef_stack_int;
+      fprintf(stderr, "\nUndef 1 of 8 (stack, 32 bit)\n");
+      x += (undef_stack_int == 0x12345678 ? 10 : 21);
+   }
+   
+   // Stack, 32-bit, recently modified.  Nb: we have to do the register
+   // mucking about to make sure that the modification isn't fenced by a
+   // store/load pair and thus not seen (see origin2-not-quite.c).
+   {
+      volatile int undef_stack_int;
+      register int modified_undef_stack_int;
+      fprintf(stderr, "\nUndef 2 of 8 (stack, 32 bit)\n");
+      modified_undef_stack_int = undef_stack_int;
+      modified_undef_stack_int++;
+      x += (modified_undef_stack_int == 0x1234 ? 11 : 22);
+   }
+   
+   // Stack, 64-bit.  XXX: gets reported with two identical origins.
+   {
+      volatile long long undef_stack_longlong;
+      fprintf(stderr, "\nUndef 3 of 8 (stack, 64 bit)\n");
+      x += (undef_stack_longlong == 0x1234567812345678LL ? 11 : 22);
+   }
+   
+   // Malloc block, uninitialised, 32-bit
+   {
+      int* ptr_to_undef_malloc_int = malloc(sizeof(int));
+      int  undef_malloc_int = *ptr_to_undef_malloc_int;
+      fprintf(stderr, "\nUndef 4 of 8 (mallocd, 32-bit)\n");
+      x += (undef_malloc_int == 0x12345678 ? 12 : 23);
+   }
+
+   // Realloc block, uninitialised
+   {
+      int* ptr_to_undef_malloc_int2 = malloc(sizeof(int));
+         // Allocate a big chunk to ensure that a new block is allocated.
+      int* ptr_to_undef_realloc_int = realloc(ptr_to_undef_malloc_int2, 4096);
+         // Have to move past the first 4 bytes, which were copied from the
+         // malloc'd block.
+      int  undef_realloc_int = *(ptr_to_undef_realloc_int+1);
+      fprintf(stderr, "\nUndef 5 of 8 (realloc)\n");
+      x += (undef_realloc_int == 0x12345678 ? 13 : 24);
+   }
+
+   // Custom-allocated block, non-zeroed
+   {
+      int  undef_custom_alloc_int;
+      VALGRIND_MALLOCLIKE_BLOCK(&undef_custom_alloc_int, sizeof(int),
+                                /*rzB*/0, /*is_zeroed*/0);
+      fprintf(stderr, "\nUndef 6 of 8 (MALLOCLIKE_BLOCK)\n");
+      x += (undef_custom_alloc_int == 0x12345678 ? 14 : 25);
+   }
+
+   // Heap segment (brk), uninitialised
+   {
+      int* ptr_to_new_brk_limit = sbrk(4096);
+      int  undef_brk_int = *ptr_to_new_brk_limit;
+      fprintf(stderr, "\nUndef 7 of 8 (brk)\n");
+      x += (undef_brk_int == 0x12345678 ? 15 : 26);
+   }
+
+   // User block, marked as undefined
+   {
+      int  undef_user_int = 0;
+      VALGRIND_MAKE_MEM_UNDEFINED(&undef_user_int, sizeof(int));
+      fprintf(stderr, "\nUndef 8 of 8 (MAKE_MEM_UNDEFINED)\n");
+      x += (undef_user_int == 0x12345678 ? 16 : 27);
+   }
+
+   //------------------------------------------------------------------------
+   // Sources of defined values
+   //------------------------------------------------------------------------
+
+   // Heap block (calloc), initialised
+   {
+      int* ptr_to_def_calloc_int = calloc(1, sizeof(int));
+      int  def_calloc_int = *ptr_to_def_calloc_int;
+      fprintf(stderr, "\nDef 1 of 3\n");
+      x += (def_calloc_int == 0x12345678 ? 17 : 28);
+   }
+
+   // Custom-allocated block, marked as zeroed
+   {
+      int  def_custom_alloc_int = 0;
+      fprintf(stderr, "\nDef 2 of 3\n");
+      VALGRIND_MALLOCLIKE_BLOCK(&def_custom_alloc_int, sizeof(int),
+                                /*rzB*/0, /*is_zeroed*/1);
+      x += (def_custom_alloc_int == 0x12345678 ? 18 : 29);
+   }
+
+   // mmap block, initialised
+   {
+      int* ptr_to_def_mmap_int =
+               mmap(0, 4096, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+      int def_mmap_int = *ptr_to_def_mmap_int;
+      fprintf(stderr, "\nDef 3 of 3\n");
+      x += (def_mmap_int == 0x12345678 ? 19 : 30);
+   }
+
+   return x;
+}
diff --git a/memcheck/tests/origin1-yes.stderr.exp b/memcheck/tests/origin1-yes.stderr.exp
new file mode 100644
index 0000000..d400435
--- /dev/null
+++ b/memcheck/tests/origin1-yes.stderr.exp
@@ -0,0 +1,65 @@
+
+Undef 1 of 8 (stack, 32 bit)
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:37)
+ Uninitialised value was created by a stack allocation
+   at 0x........: main (origin1-yes.c:23)
+
+Undef 2 of 8 (stack, 32 bit)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:49)
+ Uninitialised value was created by a stack allocation
+   at 0x........: main (origin1-yes.c:23)
+
+Undef 3 of 8 (stack, 64 bit)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:56)
+ Uninitialised value was created by a stack allocation
+   at 0x........: main (origin1-yes.c:23)
+
+Undef 4 of 8 (mallocd, 32-bit)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:64)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin1-yes.c:61)
+
+Undef 5 of 8 (realloc)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:76)
+ Uninitialised value was created by a heap allocation
+   at 0x........: realloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin1-yes.c:71)
+
+Undef 6 of 8 (MALLOCLIKE_BLOCK)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:85)
+ Uninitialised value was created by a heap allocation
+   at 0x........: main (origin1-yes.c:82)
+
+Undef 7 of 8 (brk)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:93)
+ Uninitialised value was created
+   at 0x........: brk (in /...libc...)
+   by 0x........: ...
+   by 0x........: main (origin1-yes.c:90)
+
+Undef 8 of 8 (MAKE_MEM_UNDEFINED)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin1-yes.c:101)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin1-yes.c:99)
+
+Def 1 of 3
+
+Def 2 of 3
+
+Def 3 of 3
diff --git a/memcheck/tests/origin1-yes.stdout.exp b/memcheck/tests/origin1-yes.stdout.exp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/memcheck/tests/origin1-yes.stdout.exp
diff --git a/memcheck/tests/origin1-yes.vgtest b/memcheck/tests/origin1-yes.vgtest
new file mode 100644
index 0000000..39cb1d4
--- /dev/null
+++ b/memcheck/tests/origin1-yes.vgtest
@@ -0,0 +1,2 @@
+prog: origin1-yes
+vgopts: -q --track-origins=yes
diff --git a/memcheck/tests/origin2-not-quite.c b/memcheck/tests/origin2-not-quite.c
new file mode 100644
index 0000000..7c7ef67
--- /dev/null
+++ b/memcheck/tests/origin2-not-quite.c
@@ -0,0 +1,49 @@
+
+/* This test case was originally written by Nicholas Nethercote. */
+
+// This test demonstrates some cases that the piggybacking algorithm
+// cannot currently handle, but conceivably might, with more modifications.  The instrumentation
+// based algorithm handles them ok, though.
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+int x = 0;
+
+typedef long long Long;
+
+int main(void)
+{
+   assert(4 == sizeof(int));
+   assert(8 == sizeof(Long));
+
+   // 64-bit undefined double.
+   {
+      double* ptr_to_undef_double = malloc(sizeof(double));
+      double  undef_double = *ptr_to_undef_double;
+      fprintf(stderr, "\nUndef 1 of 3 (64-bit FP)\n");
+      x += (undef_double < (double)123.45 ? 12 : 23);
+   }
+
+   // 32-bit undefined float.
+   {
+      float* ptr_to_undef_float = malloc(sizeof(float));
+      float undef_float = *ptr_to_undef_float;
+      fprintf(stderr, "\nUndef 2 of 3 (32-bit FP)\n");
+      x += (undef_float < (float)234.56  ? 13 : 24);
+   }
+
+   // Stack, 32-bit, recently modified.
+   // Problem here is that we don't chase backwards through loads and
+   // stores.  Ie. the variable is stored after it's been modified, then
+   // loaded again, so we don't see the unmodified version.
+   {
+      int modified_undef_stack_int;
+      modified_undef_stack_int++;
+      fprintf(stderr, "\nUndef 3 of 3 (int)\n");
+      x += (modified_undef_stack_int == 0x1234 ? 11 : 22);
+   }
+   
+   return x;
+}
diff --git a/memcheck/tests/origin2-not-quite.stderr.exp b/memcheck/tests/origin2-not-quite.stderr.exp
new file mode 100644
index 0000000..3f4e86b
--- /dev/null
+++ b/memcheck/tests/origin2-not-quite.stderr.exp
@@ -0,0 +1,22 @@
+
+Undef 1 of 3 (64-bit FP)
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin2-not-quite.c:26)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin2-not-quite.c:23)
+
+Undef 2 of 3 (32-bit FP)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin2-not-quite.c:34)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin2-not-quite.c:31)
+
+Undef 3 of 3 (int)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin2-not-quite.c:45)
+ Uninitialised value was created by a stack allocation
+   at 0x........: main (origin2-not-quite.c:17)
diff --git a/memcheck/tests/origin2-not-quite.stdout.exp b/memcheck/tests/origin2-not-quite.stdout.exp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/memcheck/tests/origin2-not-quite.stdout.exp
diff --git a/memcheck/tests/origin2-not-quite.vgtest b/memcheck/tests/origin2-not-quite.vgtest
new file mode 100644
index 0000000..eae7c49
--- /dev/null
+++ b/memcheck/tests/origin2-not-quite.vgtest
@@ -0,0 +1,2 @@
+prog: origin2-not-quite
+vgopts: -q --track-origins=yes
diff --git a/memcheck/tests/origin3-no.c b/memcheck/tests/origin3-no.c
new file mode 100644
index 0000000..7bcabd7
--- /dev/null
+++ b/memcheck/tests/origin3-no.c
@@ -0,0 +1,86 @@
+
+/* This test case was originally written by Nicholas Nethercote. */
+
+// This test demonstrates cases the piggybacking algorithm cannot handle,
+// but which are handled ok by the instrumentation based algorithm.
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "../memcheck.h"
+
+int x = 0;
+
+int main(void)
+{
+   assert(4 == sizeof(int));
+
+   // 8-bit undefined value.  When compared it's loaded from memory, so will
+   // never work.
+   {
+      char* ptr_to_undef_char = malloc(sizeof(char));
+      char  undef_char = *ptr_to_undef_char;
+      fprintf(stderr, "\nUndef 1 of 8 (8 bit undef)\n");
+      x += (undef_char == 0x12 ? 11 : 22);
+   }
+
+   // Stack, 8-bit from (recently) 32-bit.  But the load only loads 8-bits
+   // of the value, so it'll never work.
+   {
+      int undef_stack_int;
+      register char undef_stack_char = (char)undef_stack_int;
+      fprintf(stderr, "\nUndef 2 of 8 (8 bits of 32 undef)\n");
+      x += (undef_stack_char == 0x12 ? 11 : 22);
+   }
+
+   // 32-bit undefined value.  This one is identified, and is here for
+   // sanity-checking.
+   {
+      int* ptr_to_undef_int = malloc(sizeof(int));
+      int  undef_int = *ptr_to_undef_int;
+      fprintf(stderr, "\nUndef 3 of 8 (32 bit undef)\n");
+      x += (undef_int == 0x12345678 ? 13 : 24);
+   }
+
+   // Unaligned 32-bit value.
+   {
+      int* ptr_to_undef_int = malloc(sizeof(int) + 1);
+      int  undef_unaligned_int = *(int*)((long)ptr_to_undef_int + 1);
+      fprintf(stderr, "\nUndef 4 of 8 (32 bit undef, unaligned)\n");
+      x += (undef_unaligned_int == 0x12345678 ? 14 : 25);
+   }
+
+   // Modified 32-bit value.
+   {
+      int* ptr_to_undef_int3 = malloc(sizeof(int));
+      int  modified_undef_int = *ptr_to_undef_int3;
+      fprintf(stderr, "\nUndef 5 of 8 (32 bit undef, modified)\n");
+      modified_undef_int++;
+      x += (modified_undef_int == 0x12345678 ? 15 : 26);
+   }
+
+   // Uninitialised 32-bit value (middle of 3) is made undefined in two
+   // unaligned pieces:
+   //   |....|....|....|   three 4-byte integers
+   //    XXXX-YY           first MAKE_MEM_UNDEFINED
+   //           YY-XXXX    second MAKE_MEM_UNDEFINED
+   // Because the YY parts don't get marked (they're not 32-bit and aligned)
+   // the middle byte keeps its original value, which is zero (from calloc).
+   // So even though it's been marked as undefined, it doesn't have an
+   // origin-tracking value and so cannot be identified.  We also check the
+   // first and third ints (which are identified) for sanity-checking.
+   {
+      int* ptr_to_3_undef_ints = calloc(3, sizeof(int));
+      int* ptr_to_middle       = (int*)((long)ptr_to_3_undef_ints + 6);
+      VALGRIND_MAKE_MEM_UNDEFINED(ptr_to_3_undef_ints, 6);
+      VALGRIND_MAKE_MEM_UNDEFINED(ptr_to_middle,       6);
+      fprintf(stderr, "\nUndef 6 of 8 (32 bit undef, unaligned, strange, #1)\n");
+      x += (*(ptr_to_3_undef_ints + 0)  == 0x12345678 ? 16 : 27);
+      fprintf(stderr, "\nUndef 7 of 8 (32 bit undef, unaligned, strange, #2)\n");
+      x += (*(ptr_to_3_undef_ints + 1)  == 0x12345678 ? 17 : 28);
+      fprintf(stderr, "\nUndef 8 of 8 (32 bit undef, unaligned, strange, #3)\n");
+      x += (*(ptr_to_3_undef_ints + 2)  == 0x12345678 ? 18 : 29);
+   }
+
+   return x;
+}
diff --git a/memcheck/tests/origin3-no.stderr.exp b/memcheck/tests/origin3-no.stderr.exp
new file mode 100644
index 0000000..7ef704b
--- /dev/null
+++ b/memcheck/tests/origin3-no.stderr.exp
@@ -0,0 +1,59 @@
+
+Undef 1 of 8 (8 bit undef)
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:24)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin3-no.c:21)
+
+Undef 2 of 8 (8 bits of 32 undef)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:33)
+ Uninitialised value was created by a stack allocation
+   at 0x........: main (origin3-no.c:15)
+
+Undef 3 of 8 (32 bit undef)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:42)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin3-no.c:39)
+
+Undef 4 of 8 (32 bit undef, unaligned)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:50)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin3-no.c:47)
+
+Undef 5 of 8 (32 bit undef, modified)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:59)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin3-no.c:55)
+
+Undef 6 of 8 (32 bit undef, unaligned, strange, #1)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:78)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin3-no.c:75)
+
+Undef 7 of 8 (32 bit undef, unaligned, strange, #2)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:80)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin3-no.c:76)
+
+Undef 8 of 8 (32 bit undef, unaligned, strange, #3)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin3-no.c:82)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin3-no.c:76)
diff --git a/memcheck/tests/origin3-no.stdout.exp b/memcheck/tests/origin3-no.stdout.exp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/memcheck/tests/origin3-no.stdout.exp
diff --git a/memcheck/tests/origin3-no.vgtest b/memcheck/tests/origin3-no.vgtest
new file mode 100644
index 0000000..cdb45d3
--- /dev/null
+++ b/memcheck/tests/origin3-no.vgtest
@@ -0,0 +1,2 @@
+prog: origin3-no
+vgopts: -q --track-origins=yes
diff --git a/memcheck/tests/origin4-many.c b/memcheck/tests/origin4-many.c
new file mode 100644
index 0000000..f8f6ea7
--- /dev/null
+++ b/memcheck/tests/origin4-many.c
@@ -0,0 +1,59 @@
+
+/* This test case was originally written by Nicholas Nethercote. */
+
+// (old comments)
+// This file tests how many possible origins can be tracked for a single
+// error.
+// XXX: other files don't need to have multiple origins for errors now,
+//      thanks to this test...
+// (end of old comments)
+
+/* When compiled -O, this produces an executable which reports a
+   single uninitialised value error, on the value handed to the exit()
+   system call.  Fair enough.
+
+   An important question is: which of the origins is reported in the
+   error?  Well, considering that (1) m_execontext allocates ECUs
+   (origin tags, basically) in increasing order, and (2) memcheck's
+   instrumentation for dealing with two uninitialised sources simply
+   involves 'max'-ing the otags, we expect the origin to be attributed
+   to the last of the 8 mallocs, that is, to p_ui8.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+static int x = 0;
+
+int main(void)
+{
+   // Do them separately rather than all in one array so they all have
+   // different origins.
+   int* p_ui1 = malloc(sizeof(int));
+   int* p_ui2 = malloc(sizeof(int));
+   int* p_ui3 = malloc(sizeof(int));
+   int* p_ui4 = malloc(sizeof(int));
+   int* p_ui5 = malloc(sizeof(int));
+   int* p_ui6 = malloc(sizeof(int));
+   int* p_ui7 = malloc(sizeof(int));
+   int* p_ui8 = malloc(sizeof(int));
+   int  ui1 = *p_ui1;
+   int  ui2 = *p_ui2;
+   int  ui3 = *p_ui3;
+   int  ui4 = *p_ui4;
+   int  ui5 = *p_ui5;
+   int  ui6 = *p_ui6;
+   int  ui7 = *p_ui7;
+   int  ui8 = *p_ui8;
+
+   x += (ui1                                    == 0x12345678 ? 12 : 23);
+   x += (ui1 +ui2                               == 0x12345678 ? 13 : 24);
+   x += (ui1 +ui2 +ui3                          == 0x12345678 ? 14 : 25);
+   x += (ui1 +ui2 +ui3 +ui4                     == 0x12345678 ? 15 : 26);
+   x += (ui1 +ui2 +ui3 +ui4 +ui5                == 0x12345678 ? 16 : 27);
+   x += (ui1 +ui2 +ui3 +ui4 +ui5 +ui6           == 0x12345678 ? 17 : 28);
+   x += (ui1 +ui2 +ui3 +ui4 +ui5 +ui6 +ui7      == 0x12345678 ? 18 : 29);
+   x += (ui1 +ui2 +ui3 +ui4 +ui5 +ui6 +ui7 +ui8 == 0x12345678 ? 19 : 30);
+
+   return x & 1;
+}
diff --git a/memcheck/tests/origin4-many.stderr.exp-glibc25-amd64 b/memcheck/tests/origin4-many.stderr.exp-glibc25-amd64
new file mode 100644
index 0000000..b9589af
--- /dev/null
+++ b/memcheck/tests/origin4-many.stderr.exp-glibc25-amd64
@@ -0,0 +1,7 @@
+Syscall param exit_group(exit_code) contains uninitialised byte(s)
+   at 0x........: _Exit (in /...libc...)
+   by 0x........: ...
+   by 0x........: (below main) (in /...libc...)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin4-many.c:39)
diff --git a/memcheck/tests/origin4-many.stderr.exp-glibc25-x86 b/memcheck/tests/origin4-many.stderr.exp-glibc25-x86
new file mode 100644
index 0000000..e01aab5
--- /dev/null
+++ b/memcheck/tests/origin4-many.stderr.exp-glibc25-x86
@@ -0,0 +1,6 @@
+Syscall param exit_group(exit_code) contains uninitialised byte(s)
+   at 0x........: _Exit (in /...libc...)
+   by 0x........: (below main) (in /...libc...)
+ Uninitialised value was created by a heap allocation
+   at 0x........: malloc (vg_replace_malloc.c:...)
+   by 0x........: main (origin4-many.c:39)
diff --git a/memcheck/tests/origin4-many.stdout.exp b/memcheck/tests/origin4-many.stdout.exp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/memcheck/tests/origin4-many.stdout.exp
diff --git a/memcheck/tests/origin4-many.vgtest b/memcheck/tests/origin4-many.vgtest
new file mode 100644
index 0000000..18fe9f7
--- /dev/null
+++ b/memcheck/tests/origin4-many.vgtest
@@ -0,0 +1,2 @@
+prog: origin4-many
+vgopts: -q --track-origins=yes
diff --git a/memcheck/tests/origin5-bz2.c b/memcheck/tests/origin5-bz2.c
new file mode 100644
index 0000000..d56cb43
--- /dev/null
+++ b/memcheck/tests/origin5-bz2.c
@@ -0,0 +1,6537 @@
+// This benchmark is basically bzip2 (mashed to be a single file)
+// compressing and decompressing some data.  It tests Valgrind's handling of
+// realistic and "difficult" (ie. lots of branches and memory accesses)
+// integer code.  Execution is spread out over quite a few basic blocks; 
+// --profile-flags indicates that to get to the top 90%th percentile of
+// dynamic BB counts requires considering the top 51 basic blocks
+
+// This program can be used both as part of the performance test
+// suite, in which case we want it to run for quite a while,
+// and as part of the regression (correctness) test suite, in
+// which case we want it to run quickly and be verbose.
+// So it does the latter iff given a command line arg.
+
+// Licensing: the code within is mostly taken from bzip2, which has a BSD
+// license.  There is a little code from VEX, which is licensed under GPLv2
+// And it's all written by Julian Seward.
+
+#define BZ_NO_STDIO
+
+
+/*-------------------------------------------------------------*/
+/*--- Private header file for the library.                  ---*/
+/*---                                       bzlib_private.h ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+
+#ifndef _BZLIB_PRIVATE_H
+#define _BZLIB_PRIVATE_H
+
+#include <stdlib.h>
+
+#ifndef BZ_NO_STDIO
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#endif
+
+
+/*-------------------------------------------------------------*/
+/*--- Public header file for the library.                   ---*/
+/*---                                               bzlib.h ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+
+#ifndef _BZLIB_H
+#define _BZLIB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BZ_RUN               0
+#define BZ_FLUSH             1
+#define BZ_FINISH            2
+
+#define BZ_OK                0
+#define BZ_RUN_OK            1
+#define BZ_FLUSH_OK          2
+#define BZ_FINISH_OK         3
+#define BZ_STREAM_END        4
+#define BZ_SEQUENCE_ERROR    (-1)
+#define BZ_PARAM_ERROR       (-2)
+#define BZ_MEM_ERROR         (-3)
+#define BZ_DATA_ERROR        (-4)
+#define BZ_DATA_ERROR_MAGIC  (-5)
+#define BZ_IO_ERROR          (-6)
+#define BZ_UNEXPECTED_EOF    (-7)
+#define BZ_OUTBUFF_FULL      (-8)
+#define BZ_CONFIG_ERROR      (-9)
+
+typedef 
+   struct {
+      char *next_in;
+      unsigned int avail_in;
+      unsigned int total_in_lo32;
+      unsigned int total_in_hi32;
+
+      char *next_out;
+      unsigned int avail_out;
+      unsigned int total_out_lo32;
+      unsigned int total_out_hi32;
+
+      void *state;
+
+      void *(*bzalloc)(void *,int,int);
+      void (*bzfree)(void *,void *);
+      void *opaque;
+   } 
+   bz_stream;
+
+
+#ifndef BZ_IMPORT
+#define BZ_EXPORT
+#endif
+
+#ifndef BZ_NO_STDIO
+/* Need a definition for FILE */
+#include <stdio.h>
+#endif
+
+#ifdef _WIN32
+#   include <windows.h>
+#   ifdef small
+      /* windows.h define small to char */
+#      undef small
+#   endif
+#   ifdef BZ_EXPORT
+#   define BZ_API(func) WINAPI func
+#   define BZ_EXTERN extern
+#   else
+   /* import windows dll dynamically */
+#   define BZ_API(func) (WINAPI * func)
+#   define BZ_EXTERN
+#   endif
+#else
+#   define BZ_API(func) func
+#   define BZ_EXTERN extern
+#endif
+
+
+/*-- Core (low-level) library functions --*/
+
+BZ_EXTERN int BZ_API(BZ2_bzCompressInit) ( 
+      bz_stream* strm, 
+      int        blockSize100k, 
+      int        verbosity, 
+      int        workFactor 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzCompress) ( 
+      bz_stream* strm, 
+      int action 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzCompressEnd) ( 
+      bz_stream* strm 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzDecompressInit) ( 
+      bz_stream *strm, 
+      int       verbosity, 
+      int       small
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzDecompress) ( 
+      bz_stream* strm 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzDecompressEnd) ( 
+      bz_stream *strm 
+   );
+
+
+
+/*-- High(er) level library functions --*/
+
+#ifndef BZ_NO_STDIO
+#define BZ_MAX_UNUSED 5000
+
+typedef void BZFILE;
+
+BZ_EXTERN BZFILE* BZ_API(BZ2_bzReadOpen) ( 
+      int*  bzerror,   
+      FILE* f, 
+      int   verbosity, 
+      int   small,
+      void* unused,    
+      int   nUnused 
+   );
+
+BZ_EXTERN void BZ_API(BZ2_bzReadClose) ( 
+      int*    bzerror, 
+      BZFILE* b 
+   );
+
+BZ_EXTERN void BZ_API(BZ2_bzReadGetUnused) ( 
+      int*    bzerror, 
+      BZFILE* b, 
+      void**  unused,  
+      int*    nUnused 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzRead) ( 
+      int*    bzerror, 
+      BZFILE* b, 
+      void*   buf, 
+      int     len 
+   );
+
+BZ_EXTERN BZFILE* BZ_API(BZ2_bzWriteOpen) ( 
+      int*  bzerror,      
+      FILE* f, 
+      int   blockSize100k, 
+      int   verbosity, 
+      int   workFactor 
+   );
+
+BZ_EXTERN void BZ_API(BZ2_bzWrite) ( 
+      int*    bzerror, 
+      BZFILE* b, 
+      void*   buf, 
+      int     len 
+   );
+
+BZ_EXTERN void BZ_API(BZ2_bzWriteClose) ( 
+      int*          bzerror, 
+      BZFILE*       b, 
+      int           abandon, 
+      unsigned int* nbytes_in, 
+      unsigned int* nbytes_out 
+   );
+
+BZ_EXTERN void BZ_API(BZ2_bzWriteClose64) ( 
+      int*          bzerror, 
+      BZFILE*       b, 
+      int           abandon, 
+      unsigned int* nbytes_in_lo32, 
+      unsigned int* nbytes_in_hi32, 
+      unsigned int* nbytes_out_lo32, 
+      unsigned int* nbytes_out_hi32
+   );
+#endif
+
+
+/*-- Utility functions --*/
+
+BZ_EXTERN int BZ_API(BZ2_bzBuffToBuffCompress) ( 
+      char*         dest, 
+      unsigned int* destLen,
+      char*         source, 
+      unsigned int  sourceLen,
+      int           blockSize100k, 
+      int           verbosity, 
+      int           workFactor 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzBuffToBuffDecompress) ( 
+      char*         dest, 
+      unsigned int* destLen,
+      char*         source, 
+      unsigned int  sourceLen,
+      int           small, 
+      int           verbosity 
+   );
+
+
+/*--
+   Code contributed by Yoshioka Tsuneo
+   (QWF00133@niftyserve.or.jp/tsuneo-y@is.aist-nara.ac.jp),
+   to support better zlib compatibility.
+   This code is not _officially_ part of libbzip2 (yet);
+   I haven't tested it, documented it, or considered the
+   threading-safeness of it.
+   If this code breaks, please contact both Yoshioka and me.
+--*/
+
+BZ_EXTERN const char * BZ_API(BZ2_bzlibVersion) (
+      void
+   );
+
+#ifndef BZ_NO_STDIO
+BZ_EXTERN BZFILE * BZ_API(BZ2_bzopen) (
+      const char *path,
+      const char *mode
+   );
+
+BZ_EXTERN BZFILE * BZ_API(BZ2_bzdopen) (
+      int        fd,
+      const char *mode
+   );
+         
+BZ_EXTERN int BZ_API(BZ2_bzread) (
+      BZFILE* b, 
+      void* buf, 
+      int len 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzwrite) (
+      BZFILE* b, 
+      void*   buf, 
+      int     len 
+   );
+
+BZ_EXTERN int BZ_API(BZ2_bzflush) (
+      BZFILE* b
+   );
+
+BZ_EXTERN void BZ_API(BZ2_bzclose) (
+      BZFILE* b
+   );
+
+BZ_EXTERN const char * BZ_API(BZ2_bzerror) (
+      BZFILE *b, 
+      int    *errnum
+   );
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+/*-------------------------------------------------------------*/
+/*--- end                                           bzlib.h ---*/
+/*-------------------------------------------------------------*/
+
+
+
+
+/*-- General stuff. --*/
+
+#define BZ_VERSION  "1.0.3, 17-Oct-2004"
+
+typedef char            Char;
+typedef unsigned char   Bool;
+typedef unsigned char   UChar;
+typedef int             Int32;
+typedef unsigned int    UInt32;
+typedef short           Int16;
+typedef unsigned short  UInt16;
+
+#define True  ((Bool)1)
+#define False ((Bool)0)
+
+#ifndef __GNUC__
+#define __inline__  /* */
+#endif 
+
+#ifndef BZ_NO_STDIO
+extern void BZ2_bz__AssertH__fail ( int errcode );
+#define AssertH(cond,errcode) \
+   { if (!(cond)) BZ2_bz__AssertH__fail ( errcode ); }
+#if BZ_DEBUG
+#define AssertD(cond,msg) \
+   { if (!(cond)) {       \
+      fprintf ( stderr,   \
+        "\n\nlibbzip2(debug build): internal error\n\t%s\n", msg );\
+      exit(1); \
+   }}
+#else
+#define AssertD(cond,msg) /* */
+#endif
+#define VPrintf0(zf) \
+   fprintf(stderr,zf)
+#define VPrintf1(zf,za1) \
+   fprintf(stderr,zf,za1)
+#define VPrintf2(zf,za1,za2) \
+   fprintf(stderr,zf,za1,za2)
+#define VPrintf3(zf,za1,za2,za3) \
+   fprintf(stderr,zf,za1,za2,za3)
+#define VPrintf4(zf,za1,za2,za3,za4) \
+   fprintf(stderr,zf,za1,za2,za3,za4)
+#define VPrintf5(zf,za1,za2,za3,za4,za5) \
+   fprintf(stderr,zf,za1,za2,za3,za4,za5)
+#else
+extern void bz_internal_error ( int errcode );
+#define AssertH(cond,errcode) \
+   { if (!(cond)) bz_internal_error ( errcode ); }
+#define AssertD(cond,msg) /* */
+#define VPrintf0(zf) \
+   vex_printf(zf)
+#define VPrintf1(zf,za1) \
+   vex_printf(zf,za1)
+#define VPrintf2(zf,za1,za2) \
+   vex_printf(zf,za1,za2)
+#define VPrintf3(zf,za1,za2,za3) \
+   vex_printf(zf,za1,za2,za3)
+#define VPrintf4(zf,za1,za2,za3,za4) \
+   vex_printf(zf,za1,za2,za3,za4)
+#define VPrintf5(zf,za1,za2,za3,za4,za5) \
+   vex_printf(zf,za1,za2,za3,za4,za5)
+#endif
+
+
+#define BZALLOC(nnn) (strm->bzalloc)(strm->opaque,(nnn),1)
+#define BZFREE(ppp)  (strm->bzfree)(strm->opaque,(ppp))
+
+
+/*-- Header bytes. --*/
+
+#define BZ_HDR_B 0x42   /* 'B' */
+#define BZ_HDR_Z 0x5a   /* 'Z' */
+#define BZ_HDR_h 0x68   /* 'h' */
+#define BZ_HDR_0 0x30   /* '0' */
+  
+/*-- Constants for the back end. --*/
+
+#define BZ_MAX_ALPHA_SIZE 258
+#define BZ_MAX_CODE_LEN    23
+
+#define BZ_RUNA 0
+#define BZ_RUNB 1
+
+#define BZ_N_GROUPS 6
+#define BZ_G_SIZE   50
+#define BZ_N_ITERS  4
+
+#define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE))
+
+
+
+/*-- Stuff for randomising repetitive blocks. --*/
+
+extern Int32 BZ2_rNums[512];
+
+#define BZ_RAND_DECLS                          \
+   Int32 rNToGo;                               \
+   Int32 rTPos                                 \
+
+#define BZ_RAND_INIT_MASK                      \
+   s->rNToGo = 0;                              \
+   s->rTPos  = 0                               \
+
+#define BZ_RAND_MASK ((s->rNToGo == 1) ? 1 : 0)
+
+#define BZ_RAND_UPD_MASK                       \
+   if (s->rNToGo == 0) {                       \
+      s->rNToGo = BZ2_rNums[s->rTPos];         \
+      s->rTPos++;                              \
+      if (s->rTPos == 512) s->rTPos = 0;       \
+   }                                           \
+   s->rNToGo--;
+
+
+
+/*-- Stuff for doing CRCs. --*/
+
+extern UInt32 BZ2_crc32Table[256];
+
+#define BZ_INITIALISE_CRC(crcVar)              \
+{                                              \
+   crcVar = 0xffffffffL;                       \
+}
+
+#define BZ_FINALISE_CRC(crcVar)                \
+{                                              \
+   crcVar = ~(crcVar);                         \
+}
+
+#define BZ_UPDATE_CRC(crcVar,cha)              \
+{                                              \
+   crcVar = (crcVar << 8) ^                    \
+            BZ2_crc32Table[(crcVar >> 24) ^    \
+                           ((UChar)cha)];      \
+}
+
+
+
+/*-- States and modes for compression. --*/
+
+#define BZ_M_IDLE      1
+#define BZ_M_RUNNING   2
+#define BZ_M_FLUSHING  3
+#define BZ_M_FINISHING 4
+
+#define BZ_S_OUTPUT    1
+#define BZ_S_INPUT     2
+
+#define BZ_N_RADIX 2
+#define BZ_N_QSORT 12
+#define BZ_N_SHELL 18
+#define BZ_N_OVERSHOOT (BZ_N_RADIX + BZ_N_QSORT + BZ_N_SHELL + 2)
+
+
+
+
+/*-- Structure holding all the compression-side stuff. --*/
+
+typedef
+   struct {
+      /* pointer back to the struct bz_stream */
+      bz_stream* strm;
+
+      /* mode this stream is in, and whether inputting */
+      /* or outputting data */
+      Int32    mode;
+      Int32    state;
+
+      /* remembers avail_in when flush/finish requested */
+      UInt32   avail_in_expect;
+
+      /* for doing the block sorting */
+      UInt32*  arr1;
+      UInt32*  arr2;
+      UInt32*  ftab;
+      Int32    origPtr;
+
+      /* aliases for arr1 and arr2 */
+      UInt32*  ptr;
+      UChar*   block;
+      UInt16*  mtfv;
+      UChar*   zbits;
+
+      /* for deciding when to use the fallback sorting algorithm */
+      Int32    workFactor;
+
+      /* run-length-encoding of the input */
+      UInt32   state_in_ch;
+      Int32    state_in_len;
+      BZ_RAND_DECLS;
+
+      /* input and output limits and current posns */
+      Int32    nblock;
+      Int32    nblockMAX;
+      Int32    numZ;
+      Int32    state_out_pos;
+
+      /* map of bytes used in block */
+      Int32    nInUse;
+      Bool     inUse[256];
+      UChar    unseqToSeq[256];
+
+      /* the buffer for bit stream creation */
+      UInt32   bsBuff;
+      Int32    bsLive;
+
+      /* block and combined CRCs */
+      UInt32   blockCRC;
+      UInt32   combinedCRC;
+
+      /* misc administratium */
+      Int32    verbosity;
+      Int32    blockNo;
+      Int32    blockSize100k;
+
+      /* stuff for coding the MTF values */
+      Int32    nMTF;
+      Int32    mtfFreq    [BZ_MAX_ALPHA_SIZE];
+      UChar    selector   [BZ_MAX_SELECTORS];
+      UChar    selectorMtf[BZ_MAX_SELECTORS];
+
+      UChar    len     [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+      Int32    code    [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+      Int32    rfreq   [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+      /* second dimension: only 3 needed; 4 makes index calculations faster */
+      UInt32   len_pack[BZ_MAX_ALPHA_SIZE][4];
+
+   }
+   EState;
+
+
+
+/*-- externs for compression. --*/
+
+extern void 
+BZ2_blockSort ( EState* );
+
+extern void 
+BZ2_compressBlock ( EState*, Bool );
+
+extern void 
+BZ2_bsInitWrite ( EState* );
+
+extern void 
+BZ2_hbAssignCodes ( Int32*, UChar*, Int32, Int32, Int32 );
+
+extern void 
+BZ2_hbMakeCodeLengths ( UChar*, Int32*, Int32, Int32 );
+
+
+
+/*-- states for decompression. --*/
+
+/* Each BZ_X_* value is a resume point for the decompressor's state
+   machine: GET_BITS() plants a 'case' label per state so that
+   BZ2_decompress can suspend when input runs dry and restart at the
+   exact bit-field it was reading (see the GET_BITS macro below). */
+
+#define BZ_X_IDLE        1
+#define BZ_X_OUTPUT      2
+
+#define BZ_X_MAGIC_1     10
+#define BZ_X_MAGIC_2     11
+#define BZ_X_MAGIC_3     12
+#define BZ_X_MAGIC_4     13
+#define BZ_X_BLKHDR_1    14
+#define BZ_X_BLKHDR_2    15
+#define BZ_X_BLKHDR_3    16
+#define BZ_X_BLKHDR_4    17
+#define BZ_X_BLKHDR_5    18
+#define BZ_X_BLKHDR_6    19
+#define BZ_X_BCRC_1      20
+#define BZ_X_BCRC_2      21
+#define BZ_X_BCRC_3      22
+#define BZ_X_BCRC_4      23
+#define BZ_X_RANDBIT     24
+#define BZ_X_ORIGPTR_1   25
+#define BZ_X_ORIGPTR_2   26
+#define BZ_X_ORIGPTR_3   27
+#define BZ_X_MAPPING_1   28
+#define BZ_X_MAPPING_2   29
+#define BZ_X_SELECTOR_1  30
+#define BZ_X_SELECTOR_2  31
+#define BZ_X_SELECTOR_3  32
+#define BZ_X_CODING_1    33
+#define BZ_X_CODING_2    34
+#define BZ_X_CODING_3    35
+#define BZ_X_MTF_1       36
+#define BZ_X_MTF_2       37
+#define BZ_X_MTF_3       38
+#define BZ_X_MTF_4       39
+#define BZ_X_MTF_5       40
+#define BZ_X_MTF_6       41
+#define BZ_X_ENDHDR_2    42
+#define BZ_X_ENDHDR_3    43
+#define BZ_X_ENDHDR_4    44
+#define BZ_X_ENDHDR_5    45
+#define BZ_X_ENDHDR_6    46
+#define BZ_X_CCRC_1      47
+#define BZ_X_CCRC_2      48
+#define BZ_X_CCRC_3      49
+#define BZ_X_CCRC_4      50
+
+
+
+/*-- Constants for the fast MTF decoder. --*/
+
+/* mtfa[] is a 4096-entry cache split into 16-entry lists; see the
+   MTF-init loop in BZ2_decompress for how they are threaded. */
+#define MTFA_SIZE 4096
+#define MTFL_SIZE 16
+
+
+
+/*-- Structure holding all the decompression-side stuff. --*/
+
+/* One DState exists per active decompression stream.  All scalars
+   that BZ2_decompress needs across suspensions live in the save_*
+   area at the bottom; the GET_BITS machinery restores them on
+   re-entry. */
+typedef
+   struct {
+      /* pointer back to the struct bz_stream */
+      bz_stream* strm;
+
+      /* state indicator for this stream */
+      Int32    state;
+
+      /* for doing the final run-length decoding */
+      UChar    state_out_ch;
+      Int32    state_out_len;
+      Bool     blockRandomised;
+      BZ_RAND_DECLS;
+
+      /* the buffer for bit stream reading */
+      UInt32   bsBuff;
+      Int32    bsLive;
+
+      /* misc administratium */
+      Int32    blockSize100k;
+      Bool     smallDecompress;
+      Int32    currBlockNo;
+      Int32    verbosity;
+
+      /* for undoing the Burrows-Wheeler transform */
+      Int32    origPtr;
+      UInt32   tPos;
+      Int32    k0;
+      Int32    unzftab[256];
+      Int32    nblock_used;
+      Int32    cftab[257];
+      Int32    cftabCopy[257];
+
+      /* for undoing the Burrows-Wheeler transform (FAST) */
+      /* allocated as blockSize100k * 100000 words (see BZ_X_MAGIC_4
+         handling in BZ2_decompress) */
+      UInt32   *tt;
+
+      /* for undoing the Burrows-Wheeler transform (SMALL) */
+      /* ll16/ll4 together hold 20-bit entries: 16 bits in ll16 plus
+         a 4-bit nibble in ll4 (see SET_LL/GET_LL macros) */
+      UInt16   *ll16;
+      UChar    *ll4;
+
+      /* stored and calculated CRCs */
+      UInt32   storedBlockCRC;
+      UInt32   storedCombinedCRC;
+      UInt32   calculatedBlockCRC;
+      UInt32   calculatedCombinedCRC;
+
+      /* map of bytes used in block */
+      Int32    nInUse;
+      Bool     inUse[256];
+      Bool     inUse16[16];
+      UChar    seqToUnseq[256];
+
+      /* for decoding the MTF values */
+      UChar    mtfa   [MTFA_SIZE];
+      Int32    mtfbase[256 / MTFL_SIZE];
+      UChar    selector   [BZ_MAX_SELECTORS];
+      UChar    selectorMtf[BZ_MAX_SELECTORS];
+      UChar    len  [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+
+      /* Huffman decode tables, one per group */
+      Int32    limit  [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+      Int32    base   [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+      Int32    perm   [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+      Int32    minLens[BZ_N_GROUPS];
+
+      /* save area for scalars in the main decompress code */
+      Int32    save_i;
+      Int32    save_j;
+      Int32    save_t;
+      Int32    save_alphaSize;
+      Int32    save_nGroups;
+      Int32    save_nSelectors;
+      Int32    save_EOB;
+      Int32    save_groupNo;
+      Int32    save_groupPos;
+      Int32    save_nextSym;
+      Int32    save_nblockMAX;
+      Int32    save_nblock;
+      Int32    save_es;
+      Int32    save_N;
+      Int32    save_curr;
+      Int32    save_zt;
+      Int32    save_zn; 
+      Int32    save_zvec;
+      Int32    save_zj;
+      Int32    save_gSel;
+      Int32    save_gMinlen;
+      Int32*   save_gLimit;
+      Int32*   save_gBase;
+      Int32*   save_gPerm;
+
+   }
+   DState;
+
+
+
+/*-- Macros for decompression. --*/
+
+/* FAST mode: one step of the inverse BWT walk.  Each tt[] word packs
+   the successor index in its high bits and the output byte in its
+   low 8 bits. */
+#define BZ_GET_FAST(cccc)                     \
+    s->tPos = s->tt[s->tPos];                 \
+    cccc = (UChar)(s->tPos & 0xff);           \
+    s->tPos >>= 8;
+
+/* Same, but on locals c_tPos/c_tt (cached copies used by an
+   unfolded inner loop elsewhere). */
+#define BZ_GET_FAST_C(cccc)                   \
+    c_tPos = c_tt[c_tPos];                    \
+    cccc = (UChar)(c_tPos & 0xff);            \
+    c_tPos >>= 8;
+
+/* SMALL mode: ll4[] stores one 4-bit nibble per entry, two entries
+   per byte; even indices use the low nibble, odd the high. */
+#define SET_LL4(i,n)                                          \
+   { if (((i) & 0x1) == 0)                                    \
+        s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0xf0) | (n); else    \
+        s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0x0f) | ((n) << 4);  \
+   }
+
+#define GET_LL4(i)                             \
+   ((((UInt32)(s->ll4[(i) >> 1])) >> (((i) << 2) & 0x4)) & 0xF)
+
+/* A 20-bit entry: low 16 bits in ll16[], top 4 in ll4[]. */
+#define SET_LL(i,n)                          \
+   { s->ll16[i] = (UInt16)(n & 0x0000ffff);  \
+     SET_LL4(i, n >> 16);                    \
+   }
+
+#define GET_LL(i) \
+   (((UInt32)s->ll16[i]) | (GET_LL4(i) << 16))
+
+/* SMALL mode: recover the output byte by binary search in cftab,
+   then follow the 20-bit successor link. */
+#define BZ_GET_SMALL(cccc)                            \
+      cccc = BZ2_indexIntoF ( s->tPos, s->cftab );    \
+      s->tPos = GET_LL(s->tPos);
+
+
+/*-- externs for decompression. --*/
+
+/* Binary-search cftab to find which symbol bucket 'indx' falls in;
+   defined later in this file. */
+extern Int32 
+BZ2_indexIntoF ( Int32, Int32* );
+
+/* The resumable decompression state machine; defined later in this
+   file.  Returns a BZ_* status code. */
+extern Int32 
+BZ2_decompress ( DState* );
+
+/* Build limit/base/perm Huffman decode tables from code lengths. */
+extern void 
+BZ2_hbCreateDecodeTables ( Int32*, Int32*, Int32*, UChar*,
+                           Int32,  Int32, Int32 );
+
+
+#endif
+
+
+/*-- BZ_NO_STDIO seems to make NULL disappear on some platforms. --*/
+
+#ifdef BZ_NO_STDIO
+#ifndef NULL
+#define NULL 0
+#endif
+#endif
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                   bzlib_private.h ---*/
+/*-------------------------------------------------------------*/
+
+
+/* Something which has the same size as void* on the host.  That is,
+   it is 32 bits on a 32-bit host and 64 bits on a 64-bit host, and so
+   it can safely be coerced to and from a pointer type on the host
+   machine. */
+typedef  unsigned long HWord;
+/* Host character / 32-bit integer types used by the vex_printf code
+   below. */
+typedef  char          HChar;
+typedef  signed int    Int;
+typedef  unsigned int  UInt;
+
+/* 64-bit integer types (used for the %l* conversions). */
+typedef    signed long long int   Long;
+typedef  unsigned long long int   ULong;
+
+
+/////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////
+
+/* Callback into the host environment, installed elsewhere (starts
+   NULL).  Request code 1 with a byte argument logs that byte -- see
+   vex_log_bytes below.  Other request codes are not used in this
+   chunk. */
+static HWord (*serviceFn)(HWord,HWord) = 0;
+
+/* The following string/memory helpers are compiled out (#if 0) --
+   kept for reference only; my_strcat below is the only live one. */
+#if 0
+static char* my_strcpy ( char* dest, const char* src )
+{
+   char* dest_orig = dest;
+   while (*src) *dest++ = *src++;
+   *dest = 0;
+   return dest_orig;
+}
+
+static void* my_memcpy ( void *dest, const void *src, int sz )
+{
+   const char *s = (const char *)src;
+   char *d = (char *)dest;
+
+   while (sz--)
+      *d++ = *s++;
+
+   return dest;
+}
+
+static void* my_memmove( void *dst, const void *src, unsigned int len )
+{
+    register char *d;
+    register char *s;
+    if ( dst > src ) {
+        d = (char *)dst + len - 1;
+        s = (char *)src + len - 1;
+        while ( len >= 4 ) {
+            *d-- = *s--;
+            *d-- = *s--;
+            *d-- = *s--;
+            *d-- = *s--;
+            len -= 4;
+        }
+        while ( len-- ) {
+            *d-- = *s--;
+        }
+    } else if ( dst < src ) {
+        d = (char *)dst;
+        s = (char *)src;
+        while ( len >= 4 ) {
+            *d++ = *s++;
+            *d++ = *s++;
+            *d++ = *s++;
+            *d++ = *s++;
+            len -= 4;
+        }
+        while ( len-- ) {
+            *d++ = *s++;
+        }
+    }
+    return dst;
+}
+#endif
+
+/* Append the NUL-terminated string 'src' to the end of 'dest'.
+   'dest' must have room for the combined result.  Returns 'dest'. */
+char* my_strcat ( char* dest, const char* src )
+{
+   char* ret = dest;
+   char* p   = dest;
+   /* find the terminating NUL of dest */
+   while (*p != 0)
+      p++;
+   /* copy src across, including its terminating NUL */
+   do {
+      *p = *src;
+      p++;
+   } while (*src++ != 0);
+   return ret;
+}
+
+
+/////////////////////////////////////////////////////////////////////
+
+/* Emit 'n' bytes starting at 'p' through the host service function,
+   one byte per call (request code 1). */
+static void vex_log_bytes ( char* p, int n )
+{
+   int k = 0;
+   while (k < n) {
+      serviceFn( 1, (int)p[k] );
+      k++;
+   }
+}
+
+/*---------------------------------------------------------*/
+/*--- vex_printf                                        ---*/
+/*---------------------------------------------------------*/
+
+/* This should be the only <...> include in the entire VEX library.
+   New code for vex_util.c should go above this point. */
+#include <stdarg.h>
+
+/* Upper-case a single ASCII character; anything outside 'a'..'z'
+   passes through unchanged. */
+static HChar vex_toupper ( HChar c )
+{
+   if (c < 'a' || c > 'z')
+      return c;
+   return c - ('a' - 'A');
+}
+
+/* Length of a NUL-terminated string, excluding the terminator. */
+static Int vex_strlen ( const HChar* str )
+{
+   const HChar* p = str;
+   while (*p != 0)
+      p++;
+   return p - str;
+}
+
+/* Compare two NUL-terminated strings for exact equality. */
+Bool vex_streq ( const HChar* s1, const HChar* s2 )
+{
+   for (;; s1++, s2++) {
+      if (*s1 != *s2)
+         return False;
+      if (*s1 == 0)
+         return True;   /* both hit the terminator together */
+   }
+}
+
+/* Conversion-flag bits recognised by myvprintf_str/myvprintf_int64;
+   set while parsing a '%' directive in vprintf_wrk. */
+#define VG_MSG_SIGNED    1 /* The value is signed. */
+#define VG_MSG_ZJUSTIFY  2 /* Must justify with '0'. */
+#define VG_MSG_LJUSTIFY  4 /* Must justify on the left. */
+#define VG_MSG_PAREN     8 /* Parenthesize if present (for %y) */
+#define VG_MSG_COMMA    16 /* Add commas to numbers (for %d, %u) */
+
+/* Emit 'str' via 'send', honouring the field 'width' and the
+   VG_MSG_LJUSTIFY flag, optionally forcing upper case.  A string
+   longer than a non-zero width is truncated to the width.  Returns
+   the number of characters emitted.  (Padding placement follows the
+   original: LJUSTIFY pads before the text, otherwise after.) */
+static UInt
+myvprintf_str ( void(*send)(HChar), Int flags, Int width, HChar* str, 
+                Bool capitalise )
+{
+   UInt count = 0;
+   Int  k;
+   Int  len = vex_strlen(str);
+   Int  pad = width - len;
+
+   if (pad < 0)
+      pad = 0;
+   /* a non-zero width caps the number of characters emitted */
+   if (width != 0 && len > width)
+      len = width;
+
+   if (flags & VG_MSG_LJUSTIFY) {
+      count += pad;
+      for (k = 0; k < pad; k++)
+         send(' ');
+   }
+   count += len;
+   for (k = 0; k < len; k++)
+      send( capitalise ? vex_toupper(str[k]) : str[k] );
+   if (!(flags & VG_MSG_LJUSTIFY)) {
+      count += pad;
+      for (k = 0; k < pad; k++)
+         send(' ');
+   }
+
+   return count;
+}
+
+/* Emit the integer 'pL' via 'send':
+ *  FLAGS carries the VG_MSG_* bits (signedness, padding, commas).
+ *  BASE is the radix (2..16; anything else emits nothing).
+ *  WIDTH is the minimum field width.
+ * Returns the number of characters emitted.
+ * NOTE(review): pL is immediately truncated to 32 bits below, so
+ * %l conversions lose their upper half -- confirm this limitation
+ * is intended before relying on 64-bit output. */
+static UInt
+myvprintf_int64 ( void(*send)(HChar), Int flags, Int base, Int width, ULong pL)
+{
+   HChar buf[40];
+   Int   ind = 0;
+   Int   i, nc = 0;
+   Bool  neg = False;
+   HChar *digits = "0123456789ABCDEF";
+   UInt  ret = 0;
+   UInt  p = (UInt)pL;
+
+   if (base < 2 || base > 16)
+      return ret;
+ 
+   if ((flags & VG_MSG_SIGNED) && (Int)p < 0) {
+      p   = - (Int)p;
+      neg = True;
+   }
+
+   /* generate digits in reverse order, inserting a comma after
+      every third digit when VG_MSG_COMMA is set */
+   if (p == 0)
+      buf[ind++] = '0';
+   else {
+      while (p > 0) {
+         if ((flags & VG_MSG_COMMA) && 10 == base &&
+             0 == (ind-nc) % 3 && 0 != ind) 
+         {
+            buf[ind++] = ',';
+            nc++;
+         }
+         buf[ind++] = digits[p % base];
+         p /= base;
+      }
+   }
+
+   if (neg)
+      buf[ind++] = '-';
+
+   /* right-justified: pad (still reversed) up to the field width */
+   if (width > 0 && !(flags & VG_MSG_LJUSTIFY)) {
+      for(; ind < width; ind++) {
+	//vassert(ind < 39);
+         buf[ind] = ((flags & VG_MSG_ZJUSTIFY) ? '0': ' ');
+      }
+   }
+
+   /* Reverse copy to buffer.  */
+   ret += ind;
+   for (i = ind -1; i >= 0; i--) {
+      send(buf[i]);
+   }
+   if (width > 0 && (flags & VG_MSG_LJUSTIFY)) {
+      for(; ind < width; ind++) {
+	 ret++;
+         send(' ');  // Never pad with zeroes on RHS -- changes the value!
+      }
+   }
+   return ret;
+}
+
+
+/* A simple vprintf(): walks 'format', pushing literal characters
+   and formatted conversions out through 'send' one HChar at a time.
+   Supports %d %u %p %x %c %s %S (upper-cased %s), the 'l' length
+   modifier, and the flags '(' ',' '-' '0' plus a decimal width.
+   Returns the number of characters emitted.  Unknown conversion
+   characters are silently dropped (see 'default' below). */
+static 
+UInt vprintf_wrk ( void(*send)(HChar), const HChar *format, va_list vargs )
+{
+   UInt ret = 0;
+   int i;
+   int flags;
+   int width;
+   Bool is_long;
+
+   /* We assume that vargs has already been initialised by the 
+      caller, using va_start, and that the caller will similarly
+      clean up with va_end.
+   */
+
+   for (i = 0; format[i] != 0; i++) {
+      if (format[i] != '%') {
+         send(format[i]);
+	 ret++;
+         continue;
+      }
+      i++;
+      /* A '%' has been found.  Ignore a trailing %. */
+      if (format[i] == 0)
+         break;
+      if (format[i] == '%') {
+         /* `%%' is replaced by `%'. */
+         send('%');
+	 ret++;
+         continue;
+      }
+      /* parse flags, width and length modifier for this directive */
+      flags = 0;
+      is_long = False;
+      width = 0; /* length of the field. */
+      if (format[i] == '(') {
+	 flags |= VG_MSG_PAREN;
+	 i++;
+      }
+      /* If ',' follows '%', commas will be inserted. */
+      if (format[i] == ',') {
+         flags |= VG_MSG_COMMA;
+         i++;
+      }
+      /* If '-' follows '%', justify on the left. */
+      if (format[i] == '-') {
+         flags |= VG_MSG_LJUSTIFY;
+         i++;
+      }
+      /* If '0' follows '%', pads will be inserted. */
+      if (format[i] == '0') {
+         flags |= VG_MSG_ZJUSTIFY;
+         i++;
+      }
+      /* Compute the field length. */
+      while (format[i] >= '0' && format[i] <= '9') {
+         width *= 10;
+         width += format[i++] - '0';
+      }
+      while (format[i] == 'l') {
+         i++;
+         is_long = True;
+      }
+
+      switch (format[i]) {
+         case 'd': /* %d */
+            flags |= VG_MSG_SIGNED;
+            if (is_long)
+               ret += myvprintf_int64(send, flags, 10, width, 
+				      (ULong)(va_arg (vargs, Long)));
+            else
+               ret += myvprintf_int64(send, flags, 10, width, 
+				      (ULong)(va_arg (vargs, Int)));
+            break;
+         case 'u': /* %u */
+            if (is_long)
+               ret += myvprintf_int64(send, flags, 10, width, 
+				      (ULong)(va_arg (vargs, ULong)));
+            else
+               ret += myvprintf_int64(send, flags, 10, width, 
+				      (ULong)(va_arg (vargs, UInt)));
+            break;
+         case 'p': /* %p */
+	    ret += 2;
+            send('0');
+            send('x');
+            ret += myvprintf_int64(send, flags, 16, width, 
+				   (ULong)((HWord)va_arg (vargs, void *)));
+            break;
+         case 'x': /* %x */
+            if (is_long)
+               ret += myvprintf_int64(send, flags, 16, width, 
+				      (ULong)(va_arg (vargs, ULong)));
+            else
+               ret += myvprintf_int64(send, flags, 16, width, 
+				      (ULong)(va_arg (vargs, UInt)));
+            break;
+         case 'c': /* %c */
+	    ret++;
+            send((va_arg (vargs, int)));
+            break;
+         case 's': case 'S': { /* %s */
+            char *str = va_arg (vargs, char *);
+            if (str == (char*) 0) str = "(null)";
+            ret += myvprintf_str(send, flags, width, str, 
+                                 (format[i]=='S'));
+            break;
+	 }
+#        if 0
+	 case 'y': { /* %y - print symbol */
+	    Char buf[100];
+	    Char *cp = buf;
+	    Addr a = va_arg(vargs, Addr);
+
+	    if (flags & VG_MSG_PAREN)
+	       *cp++ = '(';
+	    if (VG_(get_fnname_w_offset)(a, cp, sizeof(buf)-4)) {
+	       if (flags & VG_MSG_PAREN) {
+		  cp += VG_(strlen)(cp);
+		  *cp++ = ')';
+		  *cp = '\0';
+	       }
+	       ret += myvprintf_str(send, flags, width, buf, 0);
+	    }
+	    break;
+	 }
+#        endif
+         /* unknown conversion: emit nothing and consume no vararg */
+         default:
+            break;
+      }
+   }
+   return ret;
+}
+
+
+/* A general replacement for printf().  Note that only low-level 
+   debugging info should be sent via here.  The official route is
+   to use vg_message().  This interface is deprecated.
+*/
+/* Line buffer shared by vex_printf; always kept NUL-terminated. */
+static HChar myprintf_buf[1000];
+static Int   n_myprintf_buf;
+
+/* Sink passed to vprintf_wrk: buffer one character, flushing via
+   vex_log_bytes when a newline arrives or the buffer nears full.
+   Note the flush happens BEFORE 'c' is stored, so a '\n' is emitted
+   with the NEXT flush, not this one. */
+static void add_to_myprintf_buf ( HChar c )
+{
+   if (c == '\n' || n_myprintf_buf >= 1000-10 /*paranoia*/ ) {
+      (*vex_log_bytes)( myprintf_buf, vex_strlen(myprintf_buf) );
+      n_myprintf_buf = 0;
+      myprintf_buf[n_myprintf_buf] = 0;      
+   }
+   myprintf_buf[n_myprintf_buf++] = c;
+   myprintf_buf[n_myprintf_buf] = 0;
+}
+
+/* printf() front end: formats into myprintf_buf via vprintf_wrk and
+   flushes any remainder through vex_log_bytes.  Returns the number
+   of characters produced.  Not reentrant (static buffer). */
+static UInt vex_printf ( const char *format, ... )
+{
+   UInt ret;
+   va_list vargs;
+   va_start(vargs,format);
+   
+   n_myprintf_buf = 0;
+   myprintf_buf[n_myprintf_buf] = 0;      
+   ret = vprintf_wrk ( add_to_myprintf_buf, format, vargs );
+
+   /* flush whatever the last '\n' did not */
+   if (n_myprintf_buf > 0) {
+      (*vex_log_bytes)( myprintf_buf, n_myprintf_buf );
+   }
+
+   va_end(vargs);
+
+   return ret;
+}
+
+/*---------------------------------------------------------------*/
+/*--- end                                          vex_util.c ---*/
+/*---------------------------------------------------------------*/
+
+
+/////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////
+
+
+/*-------------------------------------------------------------*/
+/*--- Decompression machinery                               ---*/
+/*---                                          decompress.c ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+
+
+
+/*---------------------------------------------------*/
+/* Rebuild seqToUnseq[] and nInUse from the inUse[] bitmap received
+   in the block header. */
+static
+void makeMaps_d ( DState* s )
+{
+   Int32 sym;
+   Int32 count = 0;
+   for (sym = 0; sym < 256; sym++) {
+      if (!s->inUse[sym]) continue;
+      s->seqToUnseq[count] = sym;
+      count++;
+   }
+   s->nInUse = count;
+}
+
+
+/*---------------------------------------------------*/
+/* Suspend BZ2_decompress: stash the status code and jump to the
+   common save/exit path. */
+#define RETURN(rrr)                               \
+   { retVal = rrr; goto save_state_and_return; };
+
+/* Read nnn bits into vvv.  Each use plants a 'case lll:' label in
+   the big switch in BZ2_decompress, so if avail_in runs out the
+   function RETURNs BZ_OK and, on the next call, the switch jumps
+   straight back here with all state restored -- a hand-rolled
+   coroutine. */
+#define GET_BITS(lll,vvv,nnn)                     \
+   case lll: s->state = lll;                      \
+   while (True) {                                 \
+      if (s->bsLive >= nnn) {                     \
+         UInt32 v;                                \
+         v = (s->bsBuff >>                        \
+             (s->bsLive-nnn)) & ((1 << nnn)-1);   \
+         s->bsLive -= nnn;                        \
+         vvv = v;                                 \
+         break;                                   \
+      }                                           \
+      if (s->strm->avail_in == 0) RETURN(BZ_OK);  \
+      s->bsBuff                                   \
+         = (s->bsBuff << 8) |                     \
+           ((UInt32)                              \
+              (*((UChar*)(s->strm->next_in))));   \
+      s->bsLive += 8;                             \
+      s->strm->next_in++;                         \
+      s->strm->avail_in--;                        \
+      s->strm->total_in_lo32++;                   \
+      if (s->strm->total_in_lo32 == 0)            \
+         s->strm->total_in_hi32++;                \
+   }
+
+#define GET_UCHAR(lll,uuu)                        \
+   GET_BITS(lll,uuu,8)
+
+#define GET_BIT(lll,uuu)                          \
+   GET_BITS(lll,uuu,1)
+
+/*---------------------------------------------------*/
+/* Decode one Huffman symbol into lval, switching coding tables
+   every BZ_G_SIZE symbols as directed by selector[].  Uses the
+   canonical-Huffman limit/base/perm tables; needs two resume labels
+   because it can run dry at either bit read. */
+#define GET_MTF_VAL(label1,label2,lval)           \
+{                                                 \
+   if (groupPos == 0) {                           \
+      groupNo++;                                  \
+      if (groupNo >= nSelectors)                  \
+         RETURN(BZ_DATA_ERROR);                   \
+      groupPos = BZ_G_SIZE;                       \
+      gSel = s->selector[groupNo];                \
+      gMinlen = s->minLens[gSel];                 \
+      gLimit = &(s->limit[gSel][0]);              \
+      gPerm = &(s->perm[gSel][0]);                \
+      gBase = &(s->base[gSel][0]);                \
+   }                                              \
+   groupPos--;                                    \
+   zn = gMinlen;                                  \
+   GET_BITS(label1, zvec, zn);                    \
+   while (1) {                                    \
+      if (zn > 20 /* the longest code */)         \
+         RETURN(BZ_DATA_ERROR);                   \
+      if (zvec <= gLimit[zn]) break;              \
+      zn++;                                       \
+      GET_BIT(label2, zj);                        \
+      zvec = (zvec << 1) | zj;                    \
+   };                                             \
+   if (zvec - gBase[zn] < 0                       \
+       || zvec - gBase[zn] >= BZ_MAX_ALPHA_SIZE)  \
+      RETURN(BZ_DATA_ERROR);                      \
+   lval = gPerm[zvec - gBase[zn]];                \
+}
+
+
+
+/*---------------------------------------------------*/
+/* Binary search the cumulative-frequency table: return the unique
+   nb in [0,255] with cftab[nb] <= indx < cftab[nb+1]. */
+__inline__ Int32 BZ2_indexIntoF ( Int32 indx, Int32 *cftab )
+{
+   Int32 lo = 0;
+   Int32 hi = 256;
+   /* invariant: cftab[lo] <= indx < cftab[hi] */
+   while (hi - lo != 1) {
+      Int32 mid = (lo + hi) >> 1;
+      if (indx >= cftab[mid])
+         lo = mid;
+      else
+         hi = mid;
+   }
+   return lo;
+}
+
+/*---------------------------------------------------*/
+Int32 BZ2_decompress ( DState* s )
+{
+   UChar      uc;
+   Int32      retVal;
+   Int32      minLen, maxLen;
+   bz_stream* strm = s->strm;
+
+   /* stuff that needs to be saved/restored */
+   Int32  i;
+   Int32  j;
+   Int32  t;
+   Int32  alphaSize;
+   Int32  nGroups;
+   Int32  nSelectors;
+   Int32  EOB;
+   Int32  groupNo;
+   Int32  groupPos;
+   Int32  nextSym;
+   Int32  nblockMAX;
+   Int32  nblock;
+   Int32  es;
+   Int32  N;
+   Int32  curr;
+   Int32  zt;
+   Int32  zn; 
+   Int32  zvec;
+   Int32  zj;
+   Int32  gSel;
+   Int32  gMinlen;
+   Int32* gLimit;
+   Int32* gBase;
+   Int32* gPerm;
+
+   if (s->state == BZ_X_MAGIC_1) {
+      /*initialise the save area*/
+      s->save_i           = 0;
+      s->save_j           = 0;
+      s->save_t           = 0;
+      s->save_alphaSize   = 0;
+      s->save_nGroups     = 0;
+      s->save_nSelectors  = 0;
+      s->save_EOB         = 0;
+      s->save_groupNo     = 0;
+      s->save_groupPos    = 0;
+      s->save_nextSym     = 0;
+      s->save_nblockMAX   = 0;
+      s->save_nblock      = 0;
+      s->save_es          = 0;
+      s->save_N           = 0;
+      s->save_curr        = 0;
+      s->save_zt          = 0;
+      s->save_zn          = 0;
+      s->save_zvec        = 0;
+      s->save_zj          = 0;
+      s->save_gSel        = 0;
+      s->save_gMinlen     = 0;
+      s->save_gLimit      = NULL;
+      s->save_gBase       = NULL;
+      s->save_gPerm       = NULL;
+   }
+
+   /*restore from the save area*/
+   i           = s->save_i;
+   j           = s->save_j;
+   t           = s->save_t;
+   alphaSize   = s->save_alphaSize;
+   nGroups     = s->save_nGroups;
+   nSelectors  = s->save_nSelectors;
+   EOB         = s->save_EOB;
+   groupNo     = s->save_groupNo;
+   groupPos    = s->save_groupPos;
+   nextSym     = s->save_nextSym;
+   nblockMAX   = s->save_nblockMAX;
+   nblock      = s->save_nblock;
+   es          = s->save_es;
+   N           = s->save_N;
+   curr        = s->save_curr;
+   zt          = s->save_zt;
+   zn          = s->save_zn; 
+   zvec        = s->save_zvec;
+   zj          = s->save_zj;
+   gSel        = s->save_gSel;
+   gMinlen     = s->save_gMinlen;
+   gLimit      = s->save_gLimit;
+   gBase       = s->save_gBase;
+   gPerm       = s->save_gPerm;
+
+   retVal = BZ_OK;
+
+   switch (s->state) {
+
+      GET_UCHAR(BZ_X_MAGIC_1, uc);
+      if (uc != BZ_HDR_B) RETURN(BZ_DATA_ERROR_MAGIC);
+
+      GET_UCHAR(BZ_X_MAGIC_2, uc);
+      if (uc != BZ_HDR_Z) RETURN(BZ_DATA_ERROR_MAGIC);
+
+      GET_UCHAR(BZ_X_MAGIC_3, uc)
+      if (uc != BZ_HDR_h) RETURN(BZ_DATA_ERROR_MAGIC);
+
+      GET_BITS(BZ_X_MAGIC_4, s->blockSize100k, 8)
+      if (s->blockSize100k < (BZ_HDR_0 + 1) || 
+          s->blockSize100k > (BZ_HDR_0 + 9)) RETURN(BZ_DATA_ERROR_MAGIC);
+      s->blockSize100k -= BZ_HDR_0;
+
+      if (s->smallDecompress) {
+         s->ll16 = BZALLOC( s->blockSize100k * 100000 * sizeof(UInt16) );
+         s->ll4  = BZALLOC( 
+                      ((1 + s->blockSize100k * 100000) >> 1) * sizeof(UChar) 
+                   );
+         if (s->ll16 == NULL || s->ll4 == NULL) RETURN(BZ_MEM_ERROR);
+      } else {
+         s->tt  = BZALLOC( s->blockSize100k * 100000 * sizeof(Int32) );
+         if (s->tt == NULL) RETURN(BZ_MEM_ERROR);
+      }
+
+      GET_UCHAR(BZ_X_BLKHDR_1, uc);
+
+      if (uc == 0x17) goto endhdr_2;
+      if (uc != 0x31) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_BLKHDR_2, uc);
+      if (uc != 0x41) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_BLKHDR_3, uc);
+      if (uc != 0x59) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_BLKHDR_4, uc);
+      if (uc != 0x26) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_BLKHDR_5, uc);
+      if (uc != 0x53) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_BLKHDR_6, uc);
+      if (uc != 0x59) RETURN(BZ_DATA_ERROR);
+
+      s->currBlockNo++;
+      if (s->verbosity >= 2)
+         VPrintf1 ( "\n    [%d: huff+mtf ", s->currBlockNo );
+ 
+      s->storedBlockCRC = 0;
+      GET_UCHAR(BZ_X_BCRC_1, uc);
+      s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc);
+      GET_UCHAR(BZ_X_BCRC_2, uc);
+      s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc);
+      GET_UCHAR(BZ_X_BCRC_3, uc);
+      s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc);
+      GET_UCHAR(BZ_X_BCRC_4, uc);
+      s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc);
+
+      GET_BITS(BZ_X_RANDBIT, s->blockRandomised, 1);
+
+      s->origPtr = 0;
+      GET_UCHAR(BZ_X_ORIGPTR_1, uc);
+      s->origPtr = (s->origPtr << 8) | ((Int32)uc);
+      GET_UCHAR(BZ_X_ORIGPTR_2, uc);
+      s->origPtr = (s->origPtr << 8) | ((Int32)uc);
+      GET_UCHAR(BZ_X_ORIGPTR_3, uc);
+      s->origPtr = (s->origPtr << 8) | ((Int32)uc);
+
+      if (s->origPtr < 0)
+         RETURN(BZ_DATA_ERROR);
+      if (s->origPtr > 10 + 100000*s->blockSize100k) 
+         RETURN(BZ_DATA_ERROR);
+
+      /*--- Receive the mapping table ---*/
+      for (i = 0; i < 16; i++) {
+         GET_BIT(BZ_X_MAPPING_1, uc);
+         if (uc == 1) 
+            s->inUse16[i] = True; else 
+            s->inUse16[i] = False;
+      }
+
+      for (i = 0; i < 256; i++) s->inUse[i] = False;
+
+      for (i = 0; i < 16; i++)
+         if (s->inUse16[i])
+            for (j = 0; j < 16; j++) {
+               GET_BIT(BZ_X_MAPPING_2, uc);
+               if (uc == 1) s->inUse[i * 16 + j] = True;
+            }
+      makeMaps_d ( s );
+      if (s->nInUse == 0) RETURN(BZ_DATA_ERROR);
+      alphaSize = s->nInUse+2;
+
+      /*--- Now the selectors ---*/
+      GET_BITS(BZ_X_SELECTOR_1, nGroups, 3);
+      if (nGroups < 2 || nGroups > 6) RETURN(BZ_DATA_ERROR);
+      GET_BITS(BZ_X_SELECTOR_2, nSelectors, 15);
+      if (nSelectors < 1) RETURN(BZ_DATA_ERROR);
+      for (i = 0; i < nSelectors; i++) {
+         j = 0;
+         while (True) {
+            GET_BIT(BZ_X_SELECTOR_3, uc);
+            if (uc == 0) break;
+            j++;
+            if (j >= nGroups) RETURN(BZ_DATA_ERROR);
+         }
+         s->selectorMtf[i] = j;
+      }
+
+      /*--- Undo the MTF values for the selectors. ---*/
+      {
+         UChar pos[BZ_N_GROUPS], tmp, v;
+         for (v = 0; v < nGroups; v++) pos[v] = v;
+   
+         for (i = 0; i < nSelectors; i++) {
+            v = s->selectorMtf[i];
+            tmp = pos[v];
+            while (v > 0) { pos[v] = pos[v-1]; v--; }
+            pos[0] = tmp;
+            s->selector[i] = tmp;
+         }
+      }
+
+      /*--- Now the coding tables ---*/
+      for (t = 0; t < nGroups; t++) {
+         GET_BITS(BZ_X_CODING_1, curr, 5);
+         for (i = 0; i < alphaSize; i++) {
+            while (True) {
+               if (curr < 1 || curr > 20) RETURN(BZ_DATA_ERROR);
+               GET_BIT(BZ_X_CODING_2, uc);
+               if (uc == 0) break;
+               GET_BIT(BZ_X_CODING_3, uc);
+               if (uc == 0) curr++; else curr--;
+            }
+            s->len[t][i] = curr;
+         }
+      }
+
+      /*--- Create the Huffman decoding tables ---*/
+      for (t = 0; t < nGroups; t++) {
+         minLen = 32;
+         maxLen = 0;
+         for (i = 0; i < alphaSize; i++) {
+            if (s->len[t][i] > maxLen) maxLen = s->len[t][i];
+            if (s->len[t][i] < minLen) minLen = s->len[t][i];
+         }
+         BZ2_hbCreateDecodeTables ( 
+            &(s->limit[t][0]), 
+            &(s->base[t][0]), 
+            &(s->perm[t][0]), 
+            &(s->len[t][0]),
+            minLen, maxLen, alphaSize
+         );
+         s->minLens[t] = minLen;
+      }
+
+      /*--- Now the MTF values ---*/
+
+      EOB      = s->nInUse+1;
+      nblockMAX = 100000 * s->blockSize100k;
+      groupNo  = -1;
+      groupPos = 0;
+
+      for (i = 0; i <= 255; i++) s->unzftab[i] = 0;
+
+      /*-- MTF init --*/
+      {
+         Int32 ii, jj, kk;
+         kk = MTFA_SIZE-1;
+         for (ii = 256 / MTFL_SIZE - 1; ii >= 0; ii--) {
+            for (jj = MTFL_SIZE-1; jj >= 0; jj--) {
+               s->mtfa[kk] = (UChar)(ii * MTFL_SIZE + jj);
+               kk--;
+            }
+            s->mtfbase[ii] = kk + 1;
+         }
+      }
+      /*-- end MTF init --*/
+
+      nblock = 0;
+      GET_MTF_VAL(BZ_X_MTF_1, BZ_X_MTF_2, nextSym);
+
+      while (True) {
+
+         if (nextSym == EOB) break;
+
+         if (nextSym == BZ_RUNA || nextSym == BZ_RUNB) {
+
+            es = -1;
+            N = 1;
+            do {
+               if (nextSym == BZ_RUNA) es = es + (0+1) * N; else
+               if (nextSym == BZ_RUNB) es = es + (1+1) * N;
+               N = N * 2;
+               GET_MTF_VAL(BZ_X_MTF_3, BZ_X_MTF_4, nextSym);
+            }
+               while (nextSym == BZ_RUNA || nextSym == BZ_RUNB);
+
+            es++;
+            uc = s->seqToUnseq[ s->mtfa[s->mtfbase[0]] ];
+            s->unzftab[uc] += es;
+
+            if (s->smallDecompress)
+               while (es > 0) {
+                  if (nblock >= nblockMAX) RETURN(BZ_DATA_ERROR);
+                  s->ll16[nblock] = (UInt16)uc;
+                  nblock++;
+                  es--;
+               }
+            else
+               while (es > 0) {
+                  if (nblock >= nblockMAX) RETURN(BZ_DATA_ERROR);
+                  s->tt[nblock] = (UInt32)uc;
+                  nblock++;
+                  es--;
+               };
+
+            continue;
+
+         } else {
+
+            if (nblock >= nblockMAX) RETURN(BZ_DATA_ERROR);
+
+            /*-- uc = MTF ( nextSym-1 ) --*/
+            {
+               Int32 ii, jj, kk, pp, lno, off;
+               UInt32 nn;
+               nn = (UInt32)(nextSym - 1);
+
+               if (nn < MTFL_SIZE) {
+                  /* avoid general-case expense */
+                  pp = s->mtfbase[0];
+                  uc = s->mtfa[pp+nn];
+                  while (nn > 3) {
+                     Int32 z = pp+nn;
+                     s->mtfa[(z)  ] = s->mtfa[(z)-1];
+                     s->mtfa[(z)-1] = s->mtfa[(z)-2];
+                     s->mtfa[(z)-2] = s->mtfa[(z)-3];
+                     s->mtfa[(z)-3] = s->mtfa[(z)-4];
+                     nn -= 4;
+                  }
+                  while (nn > 0) { 
+                     s->mtfa[(pp+nn)] = s->mtfa[(pp+nn)-1]; nn--; 
+                  };
+                  s->mtfa[pp] = uc;
+               } else { 
+                  /* general case */
+                  lno = nn / MTFL_SIZE;
+                  off = nn % MTFL_SIZE;
+                  pp = s->mtfbase[lno] + off;
+                  uc = s->mtfa[pp];
+                  while (pp > s->mtfbase[lno]) { 
+                     s->mtfa[pp] = s->mtfa[pp-1]; pp--; 
+                  };
+                  s->mtfbase[lno]++;
+                  while (lno > 0) {
+                     s->mtfbase[lno]--;
+                     s->mtfa[s->mtfbase[lno]] 
+                        = s->mtfa[s->mtfbase[lno-1] + MTFL_SIZE - 1];
+                     lno--;
+                  }
+                  s->mtfbase[0]--;
+                  s->mtfa[s->mtfbase[0]] = uc;
+                  if (s->mtfbase[0] == 0) {
+                     kk = MTFA_SIZE-1;
+                     for (ii = 256 / MTFL_SIZE-1; ii >= 0; ii--) {
+                        for (jj = MTFL_SIZE-1; jj >= 0; jj--) {
+                           s->mtfa[kk] = s->mtfa[s->mtfbase[ii] + jj];
+                           kk--;
+                        }
+                        s->mtfbase[ii] = kk + 1;
+                     }
+                  }
+               }
+            }
+            /*-- end uc = MTF ( nextSym-1 ) --*/
+
+            s->unzftab[s->seqToUnseq[uc]]++;
+            if (s->smallDecompress)
+               s->ll16[nblock] = (UInt16)(s->seqToUnseq[uc]); else
+               s->tt[nblock]   = (UInt32)(s->seqToUnseq[uc]);
+            nblock++;
+
+            GET_MTF_VAL(BZ_X_MTF_5, BZ_X_MTF_6, nextSym);
+            continue;
+         }
+      }
+
+      /* Now we know what nblock is, we can do a better sanity
+         check on s->origPtr.
+      */
+      if (s->origPtr < 0 || s->origPtr >= nblock)
+         RETURN(BZ_DATA_ERROR);
+
+      /*-- Set up cftab to facilitate generation of T^(-1) --*/
+      s->cftab[0] = 0;
+      for (i = 1; i <= 256; i++) s->cftab[i] = s->unzftab[i-1];
+      for (i = 1; i <= 256; i++) s->cftab[i] += s->cftab[i-1];
+      for (i = 0; i <= 256; i++) {
+         if (s->cftab[i] < 0 || s->cftab[i] > nblock) {
+            /* s->cftab[i] can legitimately be == nblock */
+            RETURN(BZ_DATA_ERROR);
+         }
+      }
+
+      s->state_out_len = 0;
+      s->state_out_ch  = 0;
+      BZ_INITIALISE_CRC ( s->calculatedBlockCRC );
+      s->state = BZ_X_OUTPUT;
+      if (s->verbosity >= 2) VPrintf0 ( "rt+rld" );
+
+      if (s->smallDecompress) {
+
+         /*-- Make a copy of cftab, used in generation of T --*/
+         for (i = 0; i <= 256; i++) s->cftabCopy[i] = s->cftab[i];
+
+         /*-- compute the T vector --*/
+         for (i = 0; i < nblock; i++) {
+            uc = (UChar)(s->ll16[i]);
+            SET_LL(i, s->cftabCopy[uc]);
+            s->cftabCopy[uc]++;
+         }
+
+         /*-- Compute T^(-1) by pointer reversal on T --*/
+         i = s->origPtr;
+         j = GET_LL(i);
+         do {
+            Int32 tmp = GET_LL(j);
+            SET_LL(j, i);
+            i = j;
+            j = tmp;
+         }
+            while (i != s->origPtr);
+
+         s->tPos = s->origPtr;
+         s->nblock_used = 0;
+         if (s->blockRandomised) {
+            BZ_RAND_INIT_MASK;
+            BZ_GET_SMALL(s->k0); s->nblock_used++;
+            BZ_RAND_UPD_MASK; s->k0 ^= BZ_RAND_MASK; 
+         } else {
+            BZ_GET_SMALL(s->k0); s->nblock_used++;
+         }
+
+      } else {
+
+         /*-- compute the T^(-1) vector --*/
+         for (i = 0; i < nblock; i++) {
+            uc = (UChar)(s->tt[i] & 0xff);
+            s->tt[s->cftab[uc]] |= (i << 8);
+            s->cftab[uc]++;
+         }
+
+         s->tPos = s->tt[s->origPtr] >> 8;
+         s->nblock_used = 0;
+         if (s->blockRandomised) {
+            BZ_RAND_INIT_MASK;
+            BZ_GET_FAST(s->k0); s->nblock_used++;
+            BZ_RAND_UPD_MASK; s->k0 ^= BZ_RAND_MASK; 
+         } else {
+            BZ_GET_FAST(s->k0); s->nblock_used++;
+         }
+
+      }
+
+      RETURN(BZ_OK);
+
+
+
+    endhdr_2:
+
+      GET_UCHAR(BZ_X_ENDHDR_2, uc);
+      if (uc != 0x72) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_ENDHDR_3, uc);
+      if (uc != 0x45) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_ENDHDR_4, uc);
+      if (uc != 0x38) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_ENDHDR_5, uc);
+      if (uc != 0x50) RETURN(BZ_DATA_ERROR);
+      GET_UCHAR(BZ_X_ENDHDR_6, uc);
+      if (uc != 0x90) RETURN(BZ_DATA_ERROR);
+
+      s->storedCombinedCRC = 0;
+      GET_UCHAR(BZ_X_CCRC_1, uc);
+      s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc);
+      GET_UCHAR(BZ_X_CCRC_2, uc);
+      s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc);
+      GET_UCHAR(BZ_X_CCRC_3, uc);
+      s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc);
+      GET_UCHAR(BZ_X_CCRC_4, uc);
+      s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc);
+
+      s->state = BZ_X_IDLE;
+      RETURN(BZ_STREAM_END);
+
+      default: AssertH ( False, 4001 );
+   }
+
+   AssertH ( False, 4002 );
+
+   save_state_and_return:
+
+   s->save_i           = i;
+   s->save_j           = j;
+   s->save_t           = t;
+   s->save_alphaSize   = alphaSize;
+   s->save_nGroups     = nGroups;
+   s->save_nSelectors  = nSelectors;
+   s->save_EOB         = EOB;
+   s->save_groupNo     = groupNo;
+   s->save_groupPos    = groupPos;
+   s->save_nextSym     = nextSym;
+   s->save_nblockMAX   = nblockMAX;
+   s->save_nblock      = nblock;
+   s->save_es          = es;
+   s->save_N           = N;
+   s->save_curr        = curr;
+   s->save_zt          = zt;
+   s->save_zn          = zn;
+   s->save_zvec        = zvec;
+   s->save_zj          = zj;
+   s->save_gSel        = gSel;
+   s->save_gMinlen     = gMinlen;
+   s->save_gLimit      = gLimit;
+   s->save_gBase       = gBase;
+   s->save_gPerm       = gPerm;
+
+   return retVal;   
+}
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                      decompress.c ---*/
+/*-------------------------------------------------------------*/
+
+/*-------------------------------------------------------------*/
+/*--- Block sorting machinery                               ---*/
+/*---                                           blocksort.c ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+
+  To get some idea how the block sorting algorithms in this file 
+  work, read my paper 
+     On the Performance of BWT Sorting Algorithms
+  in Proceedings of the IEEE Data Compression Conference 2000,
+  Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
+  file implements the algorithm called  cache  in the paper.
+--*/
+
+
+
+/*---------------------------------------------*/
+/*--- Fallback O(N log(N)^2) sorting        ---*/
+/*--- algorithm, for repetitive blocks      ---*/
+/*---------------------------------------------*/
+
+/*---------------------------------------------*/
+static 
+__inline__
+void fallbackSimpleSort ( UInt32* fmap, 
+                          UInt32* eclass, 
+                          Int32   lo, 
+                          Int32   hi )
+{
+   Int32 i, j, tmp;
+   UInt32 ec_tmp;
+
+   if (lo == hi) return;
+
+   if (hi - lo > 3) {
+      for ( i = hi-4; i >= lo; i-- ) {
+         tmp = fmap[i];
+         ec_tmp = eclass[tmp];
+         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
+            fmap[j-4] = fmap[j];
+         fmap[j-4] = tmp;
+      }
+   }
+
+   for ( i = hi-1; i >= lo; i-- ) {
+      tmp = fmap[i];
+      ec_tmp = eclass[tmp];
+      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
+         fmap[j-1] = fmap[j];
+      fmap[j-1] = tmp;
+   }
+}
+
+
+/*---------------------------------------------*/
/* Swap two fmap entries. */
#define fswap(zz1, zz2) \
   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }

/* Swap the zzn-element regions of fmap starting at zzp1 and zzp2.
   The regions must not overlap. */
#define fvswap(zzp1, zzp2, zzn)       \
{                                     \
   Int32 yyp1 = (zzp1);               \
   Int32 yyp2 = (zzp2);               \
   Int32 yyn  = (zzn);                \
   while (yyn > 0) {                  \
      fswap(fmap[yyp1], fmap[yyp2]);  \
      yyp1++; yyp2++; yyn--;          \
   }                                  \
}

/* Minimum of two values.  The whole expansion is parenthesised so
   the macro composes safely inside larger expressions (previously
   `fmin(a,b) + x' would misparse as `a < b ? a : (b + x)').  Both
   arguments may be evaluated more than once, so avoid
   side-effecting arguments. */
#define fmin(a,b) (((a) < (b)) ? (a) : (b))

/* Push/pop a [lo,hi] range on the explicit quicksort stack
   (stackLo/stackHi/sp are locals of the enclosing function). */
#define fpush(lz,hz) { stackLo[sp] = lz; \
                       stackHi[sp] = hz; \
                       sp++; }

#define fpop(lz,hz) { sp--;              \
                      lz = stackLo[sp];  \
                      hz = stackHi[sp]; }

/* Ranges smaller than this go to fallbackSimpleSort instead. */
#define FALLBACK_QSORT_SMALL_THRESH 10
#define FALLBACK_QSORT_STACK_SIZE   100
+
+
/* Three-way-partition quicksort of fmap[loSt..hiSt] by eclass value,
   iterative with an explicit stack.  Keys equal to the pivot are
   parked at both ends of the range during partitioning and swapped
   into the middle afterwards, so runs of equal keys are never
   recursed into (Bentley/McIlroy scheme). */
static
void fallbackQSort3 ( UInt32* fmap, 
                      UInt32* eclass,
                      Int32   loSt, 
                      Int32   hiSt )
{
   Int32 unLo, unHi, ltLo, gtHi, n, m;
   Int32 sp, lo, hi;
   UInt32 med, r, r3;
   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];

   r = 0;

   sp = 0;
   fpush ( loSt, hiSt );

   while (sp > 0) {

      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );

      fpop ( lo, hi );
      /* Small ranges are cheaper to finish with insertion sort. */
      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
         fallbackSimpleSort ( fmap, eclass, lo, hi );
         continue;
      }

      /* Random partitioning.  Median of 3 sometimes fails to
         avoid bad cases.  Median of 9 seems to help but 
         looks rather expensive.  This too seems to work but
         is cheaper.  Guidance for the magic constants 
         7621 and 32768 is taken from Sedgewick's algorithms
         book, chapter 35.
      */
      r = ((r * 7621) + 1) % 32768;
      r3 = r % 3;
      if (r3 == 0) med = eclass[fmap[lo]]; else
      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
                   med = eclass[fmap[hi]];

      /* Partition invariant:
           [lo..ltLo)   == pivot  (parked at the left end)
           [ltLo..unLo) <  pivot
           [unLo..unHi] unexamined
           (unHi..gtHi] >  pivot
           (gtHi..hi]   == pivot  (parked at the right end) */
      unLo = ltLo = lo;
      unHi = gtHi = hi;

      while (1) {
         /* Advance unLo past < keys, parking == keys on the left. */
         while (1) {
            if (unLo > unHi) break;
            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
            if (n == 0) { 
               fswap(fmap[unLo], fmap[ltLo]); 
               ltLo++; unLo++; 
               continue; 
            };
            if (n > 0) break;
            unLo++;
         }
         /* Retreat unHi past > keys, parking == keys on the right. */
         while (1) {
            if (unLo > unHi) break;
            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
            if (n == 0) { 
               fswap(fmap[unHi], fmap[gtHi]); 
               gtHi--; unHi--; 
               continue; 
            };
            if (n < 0) break;
            unHi--;
         }
         if (unLo > unHi) break;
         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
      }

      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );

      /* Entire range equal to the pivot: nothing left to sort. */
      if (gtHi < ltLo) continue;

      /* Swap the parked == keys into the middle of the range. */
      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);

      /* [lo..n] and [m..hi] are now the < and > subranges. */
      n = lo + unLo - ltLo - 1;
      m = hi - (gtHi - unHi) + 1;

      /* Push the larger subrange first so the smaller one is popped
         and processed next, bounding the stack depth. */
      if (n - lo > hi - m) {
         fpush ( lo, n );
         fpush ( m, hi );
      } else {
         fpush ( m, hi );
         fpush ( lo, n );
      }
   }
}

#undef fmin
#undef fpush
#undef fpop
#undef fswap
#undef fvswap
#undef FALLBACK_QSORT_SMALL_THRESH
#undef FALLBACK_QSORT_STACK_SIZE
+
+
/*---------------------------------------------*/
/* Pre:
      nblock > 0
      eclass exists for [0 .. nblock-1]
      ((UChar*)eclass) [0 .. nblock-1] holds block
      ptr exists for [0 .. nblock-1]

   Post:
      ((UChar*)eclass) [0 .. nblock-1] holds block
      All other areas of eclass destroyed
      fmap [0 .. nblock-1] holds sorted order
      bhtab [ 0 .. 2+(nblock/32) ] destroyed
*/

/* Bucket-header bitmap over the sorted order: bit zz of bhtab set
   means position zz starts a new bucket.  UNALIGNED_BH is true when
   zz is not on a 32-bit word boundary, letting scans switch to
   whole-word tests once aligned. */
#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
#define      WORD_BH(zz)  bhtab[(zz) >> 5]
#define UNALIGNED_BH(zz)  ((zz) & 0x01f)

/* Fallback block sort: a doubling sort in the style of Manber-Myers
   suffix-array construction.  Each pass orders positions by a prefix
   of length H, refining the buckets recorded in bhtab; H doubles
   until every bucket is a singleton or H exceeds nblock. */
static
void fallbackSort ( UInt32* fmap, 
                    UInt32* eclass, 
                    UInt32* bhtab,
                    Int32   nblock,
                    Int32   verb )
{
   Int32 ftab[257];
   Int32 ftabCopy[256];
   Int32 H, i, j, k, l, r, cc, cc1;
   Int32 nNotDone;
   Int32 nBhtab;
   UChar* eclass8 = (UChar*)eclass;

   /*--
      Initial 1-char radix sort to generate
      initial fmap and initial BH bits.
   --*/
   if (verb >= 4)
      VPrintf0 ( "        bucket sorting ...\n" );
   for (i = 0; i < 257;    i++) ftab[i] = 0;
   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];

   /* Counting sort: walk the block, dropping each position into its
      byte's bucket (filled from the top down). */
   for (i = 0; i < nblock; i++) {
      j = eclass8[i];
      k = ftab[j] - 1;
      ftab[j] = k;
      fmap[k] = i;
   }

   nBhtab = 2 + (nblock / 32);
   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
   for (i = 0; i < 256; i++) SET_BH(ftab[i]);

   /*--
      Inductively refine the buckets.  Kind-of an
      "exponential radix sort" (!), inspired by the
      Manber-Myers suffix array construction algorithm.
   --*/

   /*-- set sentinel bits for block-end detection --*/
   for (i = 0; i < 32; i++) { 
      SET_BH(nblock + 2*i);
      CLEAR_BH(nblock + 2*i + 1);
   }

   /*-- the log(N) loop --*/
   H = 1;
   while (1) {

      if (verb >= 4) 
         VPrintf1 ( "        depth %6d has ", H );

      /* Re-key: for each position, store in eclass the start index
         of the bucket holding the position H bytes further on; that
         index is the comparison key for this refinement pass. */
      j = 0;
      for (i = 0; i < nblock; i++) {
         if (ISSET_BH(i)) j = i;
         k = fmap[i] - H; if (k < 0) k += nblock;
         eclass[k] = j;
      }

      nNotDone = 0;
      r = -1;
      while (1) {

         /*-- find the next non-singleton bucket --*/
         k = r + 1;
         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
         if (ISSET_BH(k)) {
            /* Skip whole words of set bits, then the last few. */
            while (WORD_BH(k) == 0xffffffff) k += 32;
            while (ISSET_BH(k)) k++;
         }
         l = k - 1;
         if (l >= nblock) break;
         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
         if (!ISSET_BH(k)) {
            while (WORD_BH(k) == 0x00000000) k += 32;
            while (!ISSET_BH(k)) k++;
         }
         r = k - 1;
         if (r >= nblock) break;

         /*-- now [l, r] bracket current bucket --*/
         if (r > l) {
            nNotDone += (r - l + 1);
            fallbackQSort3 ( fmap, eclass, l, r );

            /*-- scan bucket and generate header bits-- */
            cc = -1;
            for (i = l; i <= r; i++) {
               cc1 = eclass[fmap[i]];
               if (cc != cc1) { SET_BH(i); cc = cc1; };
            }
         }
      }

      if (verb >= 4) 
         VPrintf1 ( "%6d unresolved strings\n", nNotDone );

      H *= 2;
      if (H > nblock || nNotDone == 0) break;
   }

   /*-- 
      Reconstruct the original block in
      eclass8 [0 .. nblock-1], since the
      previous phase destroyed it.
   --*/
   if (verb >= 4)
      VPrintf0 ( "        reconstructing block ...\n" );
   j = 0;
   for (i = 0; i < nblock; i++) {
      while (ftabCopy[j] == 0) j++;
      ftabCopy[j]--;
      eclass8[fmap[i]] = (UChar)j;
   }
   AssertH ( j < 256, 1005 );
}

#undef       SET_BH
#undef     CLEAR_BH
#undef     ISSET_BH
#undef      WORD_BH
#undef UNALIGNED_BH
+
+
+/*---------------------------------------------*/
+/*--- The main, O(N^2 log(N)) sorting       ---*/
+/*--- algorithm.  Faster for "normal"       ---*/
+/*--- non-repetitive blocks.                ---*/
+/*---------------------------------------------*/
+
+/*---------------------------------------------*/
/* Return True iff rotation i1 of the block sorts strictly after
   rotation i2.  The first 12 bytes are compared directly (hand
   unrolled); thereafter the scan proceeds 8 positions per step,
   using the quadrant values to break ties between equal bytes.
   *budget is decremented once per 8-position step, letting the
   caller detect and abandon an over-expensive sort. */
static
__inline__
Bool mainGtU ( UInt32  i1, 
               UInt32  i2,
               UChar*  block, 
               UInt16* quadrant,
               UInt32  nblock,
               Int32*  budget )
{
   Int32  k;
   UChar  c1, c2;
   UInt16 s1, s2;

   AssertD ( i1 != i2, "mainGtU" );
   /* 1 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 2 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 3 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 4 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 5 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 6 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 7 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 8 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 9 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 10 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 11 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 12 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;

   /* Enough 8-position steps to cover the whole (circular) block. */
   k = nblock + 8;

   do {
      /* 1 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 2 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 3 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 4 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 5 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 6 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 7 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 8 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;

      /* Rotations wrap around the end of the block. */
      if (i1 >= nblock) i1 -= nblock;
      if (i2 >= nblock) i2 -= nblock;

      k -= 8;
      (*budget)--;
   }
      while (k >= 0);

   /* Scanned the whole block without finding a difference: the
      rotations compare equal, so i1 is not strictly greater. */
   return False;
}
+
+
+/*---------------------------------------------*/
+/*--
+   Knuth's increments seem to work better
+   than Incerpi-Sedgewick here.  Possibly
+   because the number of elems to sort is
+   usually small, typically <= 20.
+--*/
/* Knuth's shell sort increment sequence: h' = 3h + 1. */
static
Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
                   9841, 29524, 88573, 265720,
                   797161, 2391484 };
+
/* Shell sort of ptr[lo..hi] using the increments in incs[], ordering
   rotations with mainGtU at byte offset d.  The insertion step is
   unrolled three-fold, and *budget (decremented inside mainGtU) is
   checked after each group of three insertions so an over-budget
   sort returns promptly. */
static
void mainSimpleSort ( UInt32* ptr,
                      UChar*  block,
                      UInt16* quadrant,
                      Int32   nblock,
                      Int32   lo, 
                      Int32   hi, 
                      Int32   d,
                      Int32*  budget )
{
   Int32 i, j, h, bigN, hp;
   UInt32 v;

   bigN = hi - lo + 1;
   if (bigN < 2) return;

   /* Start from the largest increment smaller than the range size. */
   hp = 0;
   while (incs[hp] < bigN) hp++;
   hp--;

   for (; hp >= 0; hp--) {
      h = incs[hp];

      i = lo + h;
      while (True) {

         /*-- copy 1 --*/
         if (i > hi) break;
         v = ptr[i];
         j = i;
         /* Shift h-spaced predecessors right until v's slot opens. */
         while ( mainGtU ( 
                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
                 ) ) {
            ptr[j] = ptr[j-h];
            j = j - h;
            if (j <= (lo + h - 1)) break;
         }
         ptr[j] = v;
         i++;

         /*-- copy 2 --*/
         if (i > hi) break;
         v = ptr[i];
         j = i;
         while ( mainGtU ( 
                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
                 ) ) {
            ptr[j] = ptr[j-h];
            j = j - h;
            if (j <= (lo + h - 1)) break;
         }
         ptr[j] = v;
         i++;

         /*-- copy 3 --*/
         if (i > hi) break;
         v = ptr[i];
         j = i;
         while ( mainGtU ( 
                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
                 ) ) {
            ptr[j] = ptr[j-h];
            j = j - h;
            if (j <= (lo + h - 1)) break;
         }
         ptr[j] = v;
         i++;

         /* Abandon the sort if mainGtU has exhausted the budget. */
         if (*budget < 0) return;
      }
   }
}
+
+
+/*---------------------------------------------*/
+/*--
+   The following is an implementation of
+   an elegant 3-way quicksort for strings,
+   described in a paper "Fast Algorithms for
+   Sorting and Searching Strings", by Robert
+   Sedgewick and Jon L. Bentley.
+--*/
+
/* Swap two ptr entries. */
#define mswap(zz1, zz2) \
   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }

/* Swap the zzn-element regions of ptr starting at zzp1 and zzp2.
   The regions must not overlap. */
#define mvswap(zzp1, zzp2, zzn)       \
{                                     \
   Int32 yyp1 = (zzp1);               \
   Int32 yyp2 = (zzp2);               \
   Int32 yyn  = (zzn);                \
   while (yyn > 0) {                  \
      mswap(ptr[yyp1], ptr[yyp2]);    \
      yyp1++; yyp2++; yyn--;          \
   }                                  \
}
+
+static 
+__inline__
+UChar mmed3 ( UChar a, UChar b, UChar c )
+{
+   UChar t;
+   if (a > b) { t = a; a = b; b = t; };
+   if (b > c) { 
+      b = c;
+      if (a > b) b = a;
+   }
+   return b;
+}
+
/* Minimum of two values.  The whole expansion is parenthesised so
   the macro composes safely inside larger expressions (previously
   `mmin(a,b) + x' would misparse as `a < b ? a : (b + x)').  Both
   arguments may be evaluated more than once, so avoid
   side-effecting arguments. */
#define mmin(a,b) (((a) < (b)) ? (a) : (b))

/* Push/pop a (lo, hi, depth) work item on the explicit quicksort
   stack (stackLo/stackHi/stackD/sp are locals of the caller). */
#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
                          stackHi[sp] = hz; \
                          stackD [sp] = dz; \
                          sp++; }

#define mpop(lz,hz,dz) { sp--;             \
                         lz = stackLo[sp]; \
                         hz = stackHi[sp]; \
                         dz = stackD [sp]; }


/* Number of elements in pending subrange az. */
#define mnextsize(az) (nextHi[az]-nextLo[az])

/* Exchange pending subranges az and bz; used to order the three
   post-partition subranges by size before pushing them. */
#define mnextswap(az,bz)                                        \
   { Int32 tz;                                                  \
     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }


/* Ranges below this size, or deeper than the depth threshold, are
   handed to mainSimpleSort instead of being partitioned further. */
#define MAIN_QSORT_SMALL_THRESH 20
#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
#define MAIN_QSORT_STACK_SIZE 100
+
/* Three-way-partition quicksort of ptr[loSt..hiSt], comparing block
   bytes at the current depth d.  Small or deep ranges are handed to
   mainSimpleSort.  Keys equal to the pivot are parked at the ends of
   the range during partitioning, swapped into the middle afterwards,
   and re-queued at depth d+1.  Iterative, with an explicit stack. */
static
void mainQSort3 ( UInt32* ptr,
                  UChar*  block,
                  UInt16* quadrant,
                  Int32   nblock,
                  Int32   loSt, 
                  Int32   hiSt, 
                  Int32   dSt,
                  Int32*  budget )
{
   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
   Int32 sp, lo, hi, d;

   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
   Int32 stackD [MAIN_QSORT_STACK_SIZE];

   Int32 nextLo[3];
   Int32 nextHi[3];
   Int32 nextD [3];

   sp = 0;
   mpush ( loSt, hiSt, dSt );

   while (sp > 0) {

      AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );

      mpop ( lo, hi, d );
      /* Small or over-deep ranges are finished by the shell sort;
         bail out entirely if the comparison budget is exhausted. */
      if (hi - lo < MAIN_QSORT_SMALL_THRESH || 
          d > MAIN_QSORT_DEPTH_THRESH) {
         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
         if (*budget < 0) return;
         continue;
      }

      /* Pivot: median of the first, last and middle byte at depth d. */
      med = (Int32) 
            mmed3 ( block[ptr[ lo         ]+d],
                    block[ptr[ hi         ]+d],
                    block[ptr[ (lo+hi)>>1 ]+d] );

      /* Partition invariant:
           [lo..ltLo)   == pivot  (parked at the left end)
           [ltLo..unLo) <  pivot
           [unLo..unHi] unexamined
           (unHi..gtHi] >  pivot
           (gtHi..hi]   == pivot  (parked at the right end) */
      unLo = ltLo = lo;
      unHi = gtHi = hi;

      while (True) {
         /* Advance unLo past < keys, parking == keys on the left. */
         while (True) {
            if (unLo > unHi) break;
            n = ((Int32)block[ptr[unLo]+d]) - med;
            if (n == 0) { 
               mswap(ptr[unLo], ptr[ltLo]); 
               ltLo++; unLo++; continue; 
            };
            if (n >  0) break;
            unLo++;
         }
         /* Retreat unHi past > keys, parking == keys on the right. */
         while (True) {
            if (unLo > unHi) break;
            n = ((Int32)block[ptr[unHi]+d]) - med;
            if (n == 0) { 
               mswap(ptr[unHi], ptr[gtHi]); 
               gtHi--; unHi--; continue; 
            };
            if (n <  0) break;
            unHi--;
         }
         if (unLo > unHi) break;
         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
      }

      AssertD ( unHi == unLo-1, "mainQSort3(2)" );

      /* Entire range equal at this depth: go one byte deeper. */
      if (gtHi < ltLo) {
         mpush(lo, hi, d+1 );
         continue;
      }

      /* Swap the parked == keys into the middle of the range. */
      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);

      n = lo + unLo - ltLo - 1;
      m = hi - (gtHi - unHi) + 1;

      /* Three pending subranges: < at depth d, > at depth d, and the
         == middle part at depth d+1. */
      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;

      /* Order by size, largest first, so the smallest subrange sits
         on top of the stack and is processed next (bounds growth). */
      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);

      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );

      mpush (nextLo[0], nextHi[0], nextD[0]);
      mpush (nextLo[1], nextHi[1], nextD[1]);
      mpush (nextLo[2], nextHi[2], nextD[2]);
   }
}

#undef mswap
#undef mvswap
#undef mpush
#undef mpop
#undef mmin
#undef mnextsize
#undef mnextswap
#undef MAIN_QSORT_SMALL_THRESH
#undef MAIN_QSORT_DEPTH_THRESH
#undef MAIN_QSORT_STACK_SIZE
+
+
+/*---------------------------------------------*/
+/* Pre:
+      nblock > N_OVERSHOOT
+      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
+      ((UChar*)block32) [0 .. nblock-1] holds block
+      ptr exists for [0 .. nblock-1]
+
+   Post:
+      ((UChar*)block32) [0 .. nblock-1] holds block
+      All other areas of block32 destroyed
+      ftab [0 .. 65536 ] destroyed
+      ptr [0 .. nblock-1] holds sorted order
+      if (*budget < 0), sorting was abandoned
+*/
+
/* Number of block positions whose first byte is b (difference of
   cumulative counts in ftab, which has one entry per 2-byte prefix). */
#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
/* Flag bit kept in ftab entries; a small bucket whose entry has this
   bit set is skipped by the quicksorting pass below. */
#define SETMASK (1 << 21)
#define CLEARMASK (~(SETMASK))
+
+static
+void mainSort ( UInt32* ptr, 
+                UChar*  block,
+                UInt16* quadrant, 
+                UInt32* ftab,
+                Int32   nblock,
+                Int32   verb,
+                Int32*  budget )
+{
+   Int32  i, j, k, ss, sb;
+   Int32  runningOrder[256];
+   Bool   bigDone[256];
+   Int32  copyStart[256];
+   Int32  copyEnd  [256];
+   UChar  c1;
+   Int32  numQSorted;
+   UInt16 s;
+   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
+
+   /*-- set up the 2-byte frequency table --*/
+   for (i = 65536; i >= 0; i--) ftab[i] = 0;
+
+   j = block[0] << 8;
+   i = nblock-1;
+   for (; i >= 3; i -= 4) {
+      quadrant[i] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
+      ftab[j]++;
+      quadrant[i-1] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
+      ftab[j]++;
+      quadrant[i-2] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
+      ftab[j]++;
+      quadrant[i-3] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
+      ftab[j]++;
+   }
+   for (; i >= 0; i--) {
+      quadrant[i] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
+      ftab[j]++;
+   }
+
+   /*-- (emphasises close relationship of block & quadrant) --*/
+   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
+      block   [nblock+i] = block[i];
+      quadrant[nblock+i] = 0;
+   }
+
+   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
+
+   /*-- Complete the initial radix sort --*/
+   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
+
+   s = block[0] << 8;
+   i = nblock-1;
+   for (; i >= 3; i -= 4) {
+      s = (s >> 8) | (block[i] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i;
+      s = (s >> 8) | (block[i-1] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i-1;
+      s = (s >> 8) | (block[i-2] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i-2;
+      s = (s >> 8) | (block[i-3] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i-3;
+   }
+   for (; i >= 0; i--) {
+      s = (s >> 8) | (block[i] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i;
+   }
+
+   /*--
+      Now ftab contains the first loc of every small bucket.
+      Calculate the running order, from smallest to largest
+      big bucket.
+   --*/
+   for (i = 0; i <= 255; i++) {
+      bigDone     [i] = False;
+      runningOrder[i] = i;
+   }
+
+   {
+      Int32 vv;
+      Int32 h = 1;
+      do h = 3 * h + 1; while (h <= 256);
+      do {
+         h = h / 3;
+         for (i = h; i <= 255; i++) {
+            vv = runningOrder[i];
+            j = i;
+            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
+               runningOrder[j] = runningOrder[j-h];
+               j = j - h;
+               if (j <= (h - 1)) goto zero;
+            }
+            zero:
+            runningOrder[j] = vv;
+         }
+      } while (h != 1);
+   }
+
+   /*--
+      The main sorting loop.
+   --*/
+
+   numQSorted = 0;
+
+   for (i = 0; i <= 255; i++) {
+
+      /*--
+         Process big buckets, starting with the least full.
+         Basically this is a 3-step process in which we call
+         mainQSort3 to sort the small buckets [ss, j], but
+         also make a big effort to avoid the calls if we can.
+      --*/
+      ss = runningOrder[i];
+
+      /*--
+         Step 1:
+         Complete the big bucket [ss] by quicksorting
+         any unsorted small buckets [ss, j], for j != ss.  
+         Hopefully previous pointer-scanning phases have already
+         completed many of the small buckets [ss, j], so
+         we don't have to sort them at all.
+      --*/
+      for (j = 0; j <= 255; j++) {
+         if (j != ss) {
+            sb = (ss << 8) + j;
+            if ( ! (ftab[sb] & SETMASK) ) {
+               Int32 lo = ftab[sb]   & CLEARMASK;
+               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
+               if (hi > lo) {
+                  if (verb >= 4)
+                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
+                                "done %d   this %d\n",
+                                ss, j, numQSorted, hi - lo + 1 );
+                  mainQSort3 ( 
+                     ptr, block, quadrant, nblock, 
+                     lo, hi, BZ_N_RADIX, budget 
+                  );   
+                  numQSorted += (hi - lo + 1);
+                  if (*budget < 0) return;
+               }
+            }
+            ftab[sb] |= SETMASK;
+         }
+      }
+
+      AssertH ( !bigDone[ss], 1006 );
+
+      /*--
+         Step 2:
+         Now scan this big bucket [ss] so as to synthesise the
+         sorted order for small buckets [t, ss] for all t,
+         including, magically, the bucket [ss,ss] too.
+         This will avoid doing Real Work in subsequent Step 1's.
+      --*/
+      {
+         for (j = 0; j <= 255; j++) {
+            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
+            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
+         }
+         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
+            k = ptr[j]-1; if (k < 0) k += nblock;
+            c1 = block[k];
+            if (!bigDone[c1])
+               ptr[ copyStart[c1]++ ] = k;
+         }
+         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
+            k = ptr[j]-1; if (k < 0) k += nblock;
+            c1 = block[k];
+            if (!bigDone[c1]) 
+               ptr[ copyEnd[c1]-- ] = k;
+         }
+      }
+
+      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
+                || 
+                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
+                   Necessity for this case is demonstrated by compressing 
+                   a sequence of approximately 48.5 million of character 
+                   251; 1.0.0/1.0.1 will then die here. */
+                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
+                1007 )
+
+      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
+
+      /*--
+         Step 3:
+         The [ss] big bucket is now done.  Record this fact,
+         and update the quadrant descriptors.  Remember to
+         update quadrants in the overshoot area too, if
+         necessary.  The "if (i < 255)" test merely skips
+         this updating for the last bucket processed, since
+         updating for the last bucket is pointless.
+
+         The quadrant array provides a way to incrementally
+         cache sort orderings, as they appear, so as to 
+         make subsequent comparisons in fullGtU() complete
+         faster.  For repetitive blocks this makes a big
+         difference (but not big enough to be able to avoid
+         the fallback sorting mechanism, exponential radix sort).
+
+         The precise meaning is: at all times:
+
+            for 0 <= i < nblock and 0 <= j <= nblock
+
+            if block[i] != block[j], 
+
+               then the relative values of quadrant[i] and 
+                    quadrant[j] are meaningless.
+
+               else {
+                  if quadrant[i] < quadrant[j]
+                     then the string starting at i lexicographically
+                     precedes the string starting at j
+
+                  else if quadrant[i] > quadrant[j]
+                     then the string starting at j lexicographically
+                     precedes the string starting at i
+
+                  else
+                     the relative ordering of the strings starting
+                     at i and j has not yet been determined.
+               }
+      --*/
+      bigDone[ss] = True;
+
+      if (i < 255) {
+         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
+         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
+         Int32 shifts   = 0;
+
+         while ((bbSize >> shifts) > 65534) shifts++;
+
+         for (j = bbSize-1; j >= 0; j--) {
+            Int32 a2update     = ptr[bbStart + j];
+            UInt16 qVal        = (UInt16)(j >> shifts);
+            quadrant[a2update] = qVal;
+            if (a2update < BZ_N_OVERSHOOT)
+               quadrant[a2update + nblock] = qVal;
+         }
+         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
+      }
+
+   }
+
+   if (verb >= 4)
+      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
+                 nblock, numQSorted, nblock - numQSorted );
+}
+
+#undef BIGFREQ
+#undef SETMASK
+#undef CLEARMASK
+
+
+/*---------------------------------------------*/
+/* Pre:
+      nblock > 0
+      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
+      ((UChar*)arr2)  [0 .. nblock-1] holds block
+      arr1 exists for [0 .. nblock-1]
+
+   Post:
+      ((UChar*)arr2) [0 .. nblock-1] holds block
+      All other areas of block destroyed
+      ftab [ 0 .. 65536 ] destroyed
+      arr1 [0 .. nblock-1] holds sorted order
+*/
+/* Top-level block sorter.  On entry ((UChar*)s->arr2) holds the
+   block; on exit s->arr1 holds the sorted order and s->origPtr is
+   the index of the original (unrotated) string in that order.
+   Tries the main sort first, and falls back to fallbackSort if
+   the work budget is exhausted (highly repetitive data). */
+void BZ2_blockSort ( EState* s )
+{
+   UInt32* ptr    = s->ptr; 
+   UChar*  block  = s->block;
+   UInt32* ftab   = s->ftab;
+   Int32   nblock = s->nblock;
+   Int32   verb   = s->verbosity;
+   Int32   wfact  = s->workFactor;
+   UInt16* quadrant;
+   Int32   budget;
+   Int32   budgetInit;
+   Int32   i;
+
+   /* NOTE(review): the small-block threshold here is 1000, not
+      bzip2's usual 10000 (original value left in the comment). */
+   if (nblock < /* 10000 */1000 ) {
+      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
+   } else {
+      /* Calculate the location for quadrant, remembering to get
+         the alignment right.  Assumes that &(block[0]) is at least
+         2-byte aligned -- this should be ok since block is really
+         the first section of arr2.
+      */
+      i = nblock+BZ_N_OVERSHOOT;
+      if (i & 1) i++;
+      quadrant = (UInt16*)(&(block[i]));
+
+      /* (wfact-1) / 3 puts the default-factor-30
+         transition point at very roughly the same place as 
+         with v0.1 and v0.9.0.  
+         Not that it particularly matters any more, since the
+         resulting compressed stream is now the same regardless
+         of whether or not we use the main sort or fallback sort.
+      */
+      if (wfact < 1  ) wfact = 1;
+      if (wfact > 100) wfact = 100;
+      budgetInit = nblock * ((wfact-1) / 3);
+      budget = budgetInit;
+
+      /* mainSort decrements budget as it works; a negative value
+         on return means it gave up. */
+      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
+      if (0 && verb >= 3) 
+         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
+                    budgetInit - budget,
+                    nblock, 
+                    (float)(budgetInit - budget) /
+                    (float)(nblock==0 ? 1 : nblock) ); 
+      if (budget < 0) {
+         if (verb >= 2) 
+            VPrintf0 ( "    too repetitive; using fallback"
+                       " sorting algorithm\n" );
+         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
+      }
+   }
+
+   /* Locate the rotation that starts at offset 0: its index in
+      the sorted order is the origPtr the decompressor needs. */
+   s->origPtr = -1;
+   for (i = 0; i < s->nblock; i++)
+      if (ptr[i] == 0)
+         { s->origPtr = i; break; };
+
+   AssertH( s->origPtr != -1, 1003 );
+}
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                       blocksort.c ---*/
+/*-------------------------------------------------------------*/
+
+/*-------------------------------------------------------------*/
+/*--- Huffman coding low-level stuff                        ---*/
+/*---                                             huffman.c ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+
+
+/*---------------------------------------------------*/
+/* Each tree node packs a (weight, depth) pair into one Int32:
+   the upper 24 bits hold the weight (scaled frequency), the low
+   8 bits the depth of the deepest leaf beneath the node. */
+#define WEIGHTOF(zz0)  ((zz0) & 0xffffff00)
+#define DEPTHOF(zz1)   ((zz1) & 0x000000ff)
+#define MYMAX(zz2,zz3) ((zz2) > (zz3) ? (zz2) : (zz3))
+
+/* Combine two nodes: weights add, depth is 1 + max of the two. */
+#define ADDWEIGHTS(zw1,zw2)                           \
+   (WEIGHTOF(zw1)+WEIGHTOF(zw2)) |                    \
+   (1 + MYMAX(DEPTHOF(zw1),DEPTHOF(zw2)))
+
+/* Binary-heap sift-up of heap[z] within heap[1..], ordered by
+   weight[]; relies on heap[0] being a sentinel of weight 0. */
+#define UPHEAP(z)                                     \
+{                                                     \
+   Int32 zz, tmp;                                     \
+   zz = z; tmp = heap[zz];                            \
+   while (weight[tmp] < weight[heap[zz >> 1]]) {      \
+      heap[zz] = heap[zz >> 1];                       \
+      zz >>= 1;                                       \
+   }                                                  \
+   heap[zz] = tmp;                                    \
+}
+
+/* Binary-heap sift-down of heap[z] within heap[1..nHeap]. */
+#define DOWNHEAP(z)                                   \
+{                                                     \
+   Int32 zz, yy, tmp;                                 \
+   zz = z; tmp = heap[zz];                            \
+   while (True) {                                     \
+      yy = zz << 1;                                   \
+      if (yy > nHeap) break;                          \
+      if (yy < nHeap &&                               \
+          weight[heap[yy+1]] < weight[heap[yy]])      \
+         yy++;                                        \
+      if (weight[tmp] < weight[heap[yy]]) break;      \
+      heap[zz] = heap[yy];                            \
+      zz = yy;                                        \
+   }                                                  \
+   heap[zz] = tmp;                                    \
+}
+
+
+/*---------------------------------------------------*/
+/* Compute Huffman code lengths for alphaSize symbols with the
+   given frequencies, writing them into len[0 .. alphaSize-1].
+   Builds the Huffman tree with a binary heap; if any resulting
+   code exceeds maxLen bits, all weights are scaled down and the
+   tree is rebuilt, repeating until every length fits. */
+void BZ2_hbMakeCodeLengths ( UChar *len, 
+                             Int32 *freq,
+                             Int32 alphaSize,
+                             Int32 maxLen )
+{
+   /*--
+      Nodes and heap entries run from 1.  Entry 0
+      for both the heap and nodes is a sentinel.
+   --*/
+   Int32 nNodes, nHeap, n1, n2, i, j, k;
+   Bool  tooLong;
+
+   Int32 heap   [ BZ_MAX_ALPHA_SIZE + 2 ];
+   Int32 weight [ BZ_MAX_ALPHA_SIZE * 2 ];
+   Int32 parent [ BZ_MAX_ALPHA_SIZE * 2 ]; 
+
+   /* Zero-frequency symbols get weight 1 so every symbol still
+      receives a code; weights live in the upper 24 bits (<< 8). */
+   for (i = 0; i < alphaSize; i++)
+      weight[i+1] = (freq[i] == 0 ? 1 : freq[i]) << 8;
+
+   while (True) {
+
+      nNodes = alphaSize;
+      nHeap = 0;
+
+      heap[0] = 0;
+      weight[0] = 0;
+      parent[0] = -2;
+
+      /* Push all leaves (symbols) onto the heap. */
+      for (i = 1; i <= alphaSize; i++) {
+         parent[i] = -1;
+         nHeap++;
+         heap[nHeap] = i;
+         UPHEAP(nHeap);
+      }
+
+      AssertH( nHeap < (BZ_MAX_ALPHA_SIZE+2), 2001 );
+   
+      /* Huffman's algorithm: repeatedly merge the two lightest
+         nodes into a new internal node until one root remains. */
+      while (nHeap > 1) {
+         n1 = heap[1]; heap[1] = heap[nHeap]; nHeap--; DOWNHEAP(1);
+         n2 = heap[1]; heap[1] = heap[nHeap]; nHeap--; DOWNHEAP(1);
+         nNodes++;
+         parent[n1] = parent[n2] = nNodes;
+         weight[nNodes] = ADDWEIGHTS(weight[n1], weight[n2]);
+         parent[nNodes] = -1;
+         nHeap++;
+         heap[nHeap] = nNodes;
+         UPHEAP(nHeap);
+      }
+
+      AssertH( nNodes < (BZ_MAX_ALPHA_SIZE * 2), 2002 );
+
+      /* Code length of each symbol = depth of its leaf, found by
+         walking the parent links up to the root. */
+      tooLong = False;
+      for (i = 1; i <= alphaSize; i++) {
+         j = 0;
+         k = i;
+         while (parent[k] >= 0) { k = parent[k]; j++; }
+         len[i-1] = j;
+         if (j > maxLen) tooLong = True;
+      }
+      
+      if (! tooLong) break;
+
+      /* 17 Oct 04: keep-going condition for the following loop used
+         to be 'i < alphaSize', which missed the last element,
+         theoretically leading to the possibility of the compressor
+         looping.  However, this count-scaling step is only needed if
+         one of the generated Huffman code words is longer than
+         maxLen, which up to and including version 1.0.2 was 20 bits,
+         which is extremely unlikely.  In version 1.0.3 maxLen was
+         changed to 17 bits, which has minimal effect on compression
+         ratio, but does mean this scaling step is used from time to
+         time, enough to verify that it works.
+
+         This means that bzip2-1.0.3 and later will only produce
+         Huffman codes with a maximum length of 17 bits.  However, in
+         order to preserve backwards compatibility with bitstreams
+         produced by versions pre-1.0.3, the decompressor must still
+         handle lengths of up to 20. */
+
+      /* Flatten the weight distribution (roughly halve each one,
+         keeping it >= 1) so the rebuilt tree is shallower. */
+      for (i = 1; i <= alphaSize; i++) {
+         j = weight[i] >> 8;
+         j = 1 + (j / 2);
+         weight[i] = j << 8;
+      }
+   }
+}
+
+
+/*---------------------------------------------------*/
+/* Assign canonical Huffman codes from the code lengths: for each
+   length n from minLen to maxLen, symbols of length n receive
+   consecutive code values, and the running value is doubled when
+   moving to the next (longer) length. */
+void BZ2_hbAssignCodes ( Int32 *code,
+                         UChar *length,
+                         Int32 minLen,
+                         Int32 maxLen,
+                         Int32 alphaSize )
+{
+   Int32 n, vec, i;
+
+   vec = 0;
+   for (n = minLen; n <= maxLen; n++) {
+      for (i = 0; i < alphaSize; i++)
+         if (length[i] == n) { code[i] = vec; vec++; };
+      vec <<= 1;
+   }
+}
+
+
+/*---------------------------------------------------*/
+/* Build the decode-side tables for a canonical Huffman code:
+   perm[] maps code-order index to symbol, while limit[] and
+   base[] let the decoder decide, for each code length, whether
+   an accumulated bit value is a complete code and which symbol
+   it denotes. */
+void BZ2_hbCreateDecodeTables ( Int32 *limit,
+                                Int32 *base,
+                                Int32 *perm,
+                                UChar *length,
+                                Int32 minLen,
+                                Int32 maxLen,
+                                Int32 alphaSize )
+{
+   Int32 pp, i, j, vec;
+
+   /* perm[]: symbols ordered by (code length, symbol value). */
+   pp = 0;
+   for (i = minLen; i <= maxLen; i++)
+      for (j = 0; j < alphaSize; j++)
+         if (length[j] == i) { perm[pp] = j; pp++; };
+
+   /* base[l+1] counts symbols of length l ... */
+   for (i = 0; i < BZ_MAX_CODE_LEN; i++) base[i] = 0;
+   for (i = 0; i < alphaSize; i++) base[length[i]+1]++;
+
+   /* ... then prefix-sum to get cumulative counts. */
+   for (i = 1; i < BZ_MAX_CODE_LEN; i++) base[i] += base[i-1];
+
+   /* limit[l]: the largest code value of length l. */
+   for (i = 0; i < BZ_MAX_CODE_LEN; i++) limit[i] = 0;
+   vec = 0;
+
+   for (i = minLen; i <= maxLen; i++) {
+      vec += (base[i+1] - base[i]);
+      limit[i] = vec-1;
+      vec <<= 1;
+   }
+   /* Rebase so that (code - base[l]) indexes into perm[]. */
+   for (i = minLen + 1; i <= maxLen; i++)
+      base[i] = ((limit[i-1] + 1) << 1) - base[i];
+}
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                         huffman.c ---*/
+/*-------------------------------------------------------------*/
+
+/*-------------------------------------------------------------*/
+/*--- Compression machinery (not incl block sorting)        ---*/
+/*---                                            compress.c ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+/*--
+   CHANGES
+   ~~~~~~~
+   0.9.0 -- original version.
+
+   0.9.0a/b -- no changes in this file.
+
+   0.9.0c
+      * changed setting of nGroups in sendMTFValues() so as to 
+        do a bit better on small files
+--*/
+
+
+
+/*---------------------------------------------------*/
+/*--- Bit stream I/O                              ---*/
+/*---------------------------------------------------*/
+
+/*---------------------------------------------------*/
+/* Reset the bit-stream accumulator prior to writing. */
+void BZ2_bsInitWrite ( EState* s )
+{
+   s->bsLive = 0;
+   s->bsBuff = 0;
+}
+
+
+/*---------------------------------------------------*/
+/* Flush any bits still pending in the accumulator out to
+   s->zbits, a byte at a time (most significant byte of bsBuff
+   first); the final partial byte is zero-padded. */
+static
+void bsFinishWrite ( EState* s )
+{
+   while (s->bsLive > 0) {
+      s->zbits[s->numZ] = (UChar)(s->bsBuff >> 24);
+      s->numZ++;
+      s->bsBuff <<= 8;
+      s->bsLive -= 8;
+   }
+}
+
+
+/*---------------------------------------------------*/
+/* Drain whole bytes from the 32-bit accumulator into s->zbits
+   until fewer than 8 live bits remain, making room for the next
+   write.  The nz argument is not used by the expansion. */
+#define bsNEEDW(nz)                           \
+{                                             \
+   while (s->bsLive >= 8) {                   \
+      s->zbits[s->numZ]                       \
+         = (UChar)(s->bsBuff >> 24);          \
+      s->numZ++;                              \
+      s->bsBuff <<= 8;                        \
+      s->bsLive -= 8;                         \
+   }                                          \
+}
+
+
+/*---------------------------------------------------*/
+/* Append the low n bits of v to the bit stream, most significant
+   bit first.  bsNEEDW guarantees bsLive < 8 beforehand, so the
+   shifted value fits in the 32-bit accumulator. */
+static
+__inline__
+void bsW ( EState* s, Int32 n, UInt32 v )
+{
+   bsNEEDW ( n );
+   s->bsBuff |= (v << (32 - s->bsLive - n));
+   s->bsLive += n;
+}
+
+
+/*---------------------------------------------------*/
+/* Write u to the bit stream as 32 bits, big-endian byte order. */
+static
+void bsPutUInt32 ( EState* s, UInt32 u )
+{
+   bsW ( s, 8, (u >> 24) & 0xffL );
+   bsW ( s, 8, (u >> 16) & 0xffL );
+   bsW ( s, 8, (u >>  8) & 0xffL );
+   bsW ( s, 8,  u        & 0xffL );
+}
+
+
+/*---------------------------------------------------*/
+/* Write a single byte to the bit stream. */
+static
+void bsPutUChar ( EState* s, UChar c )
+{
+   bsW( s, 8, (UInt32)c );
+}
+
+
+/*---------------------------------------------------*/
+/*--- The back end proper                         ---*/
+/*---------------------------------------------------*/
+
+/*---------------------------------------------------*/
+/* Build s->unseqToSeq, mapping each byte value that occurs in
+   the block (s->inUse[i] set) to a dense code 0 .. nInUse-1, and
+   set s->nInUse to the number of distinct byte values present. */
+static
+void makeMaps_e ( EState* s )
+{
+   Int32 i;
+   s->nInUse = 0;
+   for (i = 0; i < 256; i++)
+      if (s->inUse[i]) {
+         s->unseqToSeq[i] = s->nInUse;
+         s->nInUse++;
+      }
+}
+
+
+/*---------------------------------------------------*/
+/* Move-to-front encode the sorted block: read the sorted order
+   from ptr[] and the block bytes via block[]/unseqToSeq, and
+   write the MTF symbol stream into mtfv[] (aliasing s->arr1).
+   Runs of MTF-value-zero are emitted as RUNA/RUNB symbols, and a
+   final EOB symbol terminates the stream.  Symbol frequencies
+   are accumulated in s->mtfFreq and s->nMTF is set to the number
+   of symbols emitted. */
+static
+void generateMTFValues ( EState* s )
+{
+   UChar   yy[256];
+   Int32   i, j;
+   Int32   zPend;
+   Int32   wr;
+   Int32   EOB;
+
+   /* 
+      After sorting (eg, here),
+         s->arr1 [ 0 .. s->nblock-1 ] holds sorted order,
+         and
+         ((UChar*)s->arr2) [ 0 .. s->nblock-1 ] 
+         holds the original block data.
+
+      The first thing to do is generate the MTF values,
+      and put them in
+         ((UInt16*)s->arr1) [ 0 .. s->nblock-1 ].
+      Because there are strictly fewer or equal MTF values
+      than block values, ptr values in this area are overwritten
+      with MTF values only when they are no longer needed.
+
+      The final compressed bitstream is generated into the
+      area starting at
+         (UChar*) (&((UChar*)s->arr2)[s->nblock])
+
+      These storage aliases are set up in bzCompressInit(),
+      except for the last one, which is arranged in 
+      compressBlock().
+   */
+   UInt32* ptr   = s->ptr;
+   UChar* block  = s->block;
+   UInt16* mtfv  = s->mtfv;
+
+   makeMaps_e ( s );
+   EOB = s->nInUse+1;
+
+   for (i = 0; i <= EOB; i++) s->mtfFreq[i] = 0;
+
+   wr = 0;
+   zPend = 0;
+   /* yy[] is the move-to-front list, initially the identity. */
+   for (i = 0; i < s->nInUse; i++) yy[i] = (UChar) i;
+
+   for (i = 0; i < s->nblock; i++) {
+      UChar ll_i;
+      AssertD ( wr <= i, "generateMTFValues(1)" );
+      /* The byte preceding the start of the i'th sorted rotation
+         (wrapping round the block), mapped to its dense code. */
+      j = ptr[i]-1; if (j < 0) j += s->nblock;
+      ll_i = s->unseqToSeq[block[j]];
+      AssertD ( ll_i < s->nInUse, "generateMTFValues(2a)" );
+
+      if (yy[0] == ll_i) { 
+         /* MTF value would be zero: just extend the pending run. */
+         zPend++;
+      } else {
+
+         /* First flush the pending run of zeroes, coded as a
+            sequence of RUNA/RUNB symbols. */
+         if (zPend > 0) {
+            zPend--;
+            while (True) {
+               if (zPend & 1) {
+                  mtfv[wr] = BZ_RUNB; wr++; 
+                  s->mtfFreq[BZ_RUNB]++; 
+               } else {
+                  mtfv[wr] = BZ_RUNA; wr++; 
+                  s->mtfFreq[BZ_RUNA]++; 
+               }
+               if (zPend < 2) break;
+               zPend = (zPend - 2) / 2;
+            };
+            zPend = 0;
+         }
+         /* Move ll_i to the front of yy[], shifting the entries
+            in between up by one; emit its old position + 1
+            (value 0 is reserved for the run symbols). */
+         {
+            register UChar  rtmp;
+            register UChar* ryy_j;
+            register UChar  rll_i;
+            rtmp  = yy[1];
+            yy[1] = yy[0];
+            ryy_j = &(yy[1]);
+            rll_i = ll_i;
+            while ( rll_i != rtmp ) {
+               register UChar rtmp2;
+               ryy_j++;
+               rtmp2  = rtmp;
+               rtmp   = *ryy_j;
+               *ryy_j = rtmp2;
+            };
+            yy[0] = rtmp;
+            j = ryy_j - &(yy[0]);
+            mtfv[wr] = j+1; wr++; s->mtfFreq[j+1]++;
+         }
+
+      }
+   }
+
+   /* Flush any zero-run left over at the end of the block. */
+   if (zPend > 0) {
+      zPend--;
+      while (True) {
+         if (zPend & 1) {
+            mtfv[wr] = BZ_RUNB; wr++; 
+            s->mtfFreq[BZ_RUNB]++; 
+         } else {
+            mtfv[wr] = BZ_RUNA; wr++; 
+            s->mtfFreq[BZ_RUNA]++; 
+         }
+         if (zPend < 2) break;
+         zPend = (zPend - 2) / 2;
+      };
+      zPend = 0;
+   }
+
+   /* Terminate the symbol stream. */
+   mtfv[wr] = EOB; wr++; s->mtfFreq[EOB]++;
+
+   s->nMTF = wr;
+}
+
+
+/*---------------------------------------------------*/
+/* Provisional per-symbol bit costs used when seeding the coding
+   tables in sendMTFValues(): symbols inside a table's assigned
+   group are treated as cheap, all others as expensive. */
+#define BZ_LESSER_ICOST  0
+#define BZ_GREATER_ICOST 15
+
+static
+void sendMTFValues ( EState* s )
+{
+   Int32 v, t, i, j, gs, ge, totc, bt, bc, iter;
+   Int32 nSelectors, alphaSize, minLen, maxLen, selCtr;
+   Int32 nGroups, nBytes;
+
+   /*--
+   UChar  len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+   is a global since the decoder also needs it.
+
+   Int32  code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+   Int32  rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
+   are also globals only used in this proc.
+   Made global to keep stack frame size small.
+   --*/
+
+
+   UInt16 cost[BZ_N_GROUPS];
+   Int32  fave[BZ_N_GROUPS];
+
+   UInt16* mtfv = s->mtfv;
+
+   if (s->verbosity >= 3)
+      VPrintf3( "      %d in block, %d after MTF & 1-2 coding, "
+                "%d+2 syms in use\n", 
+                s->nblock, s->nMTF, s->nInUse );
+
+   alphaSize = s->nInUse+2;
+   for (t = 0; t < BZ_N_GROUPS; t++)
+      for (v = 0; v < alphaSize; v++)
+         s->len[t][v] = BZ_GREATER_ICOST;
+
+   /*--- Decide how many coding tables to use ---*/
+   AssertH ( s->nMTF > 0, 3001 );
+   if (s->nMTF < 200)  nGroups = 2; else
+   if (s->nMTF < 600)  nGroups = 3; else
+   if (s->nMTF < 1200) nGroups = 4; else
+   if (s->nMTF < 2400) nGroups = 5; else
+                       nGroups = 6;
+
+   /*--- Generate an initial set of coding tables ---*/
+   { 
+      Int32 nPart, remF, tFreq, aFreq;
+
+      nPart = nGroups;
+      remF  = s->nMTF;
+      gs = 0;
+      while (nPart > 0) {
+         tFreq = remF / nPart;
+         ge = gs-1;
+         aFreq = 0;
+         while (aFreq < tFreq && ge < alphaSize-1) {
+            ge++;
+            aFreq += s->mtfFreq[ge];
+         }
+
+         if (ge > gs 
+             && nPart != nGroups && nPart != 1 
+             && ((nGroups-nPart) % 2 == 1)) {
+            aFreq -= s->mtfFreq[ge];
+            ge--;
+         }
+
+         if (0 && s->verbosity >= 3)
+            VPrintf5( "      initial group %d, [%d .. %d], "
+                      "has %d syms (%4.1f%%)\n",
+                      nPart, gs, ge, aFreq, 
+                      (100.0 * (float)aFreq) / (float)(s->nMTF) );
+ 
+         for (v = 0; v < alphaSize; v++)
+            if (v >= gs && v <= ge) 
+               s->len[nPart-1][v] = BZ_LESSER_ICOST; else
+               s->len[nPart-1][v] = BZ_GREATER_ICOST;
+ 
+         nPart--;
+         gs = ge+1;
+         remF -= aFreq;
+      }
+   }
+
+   /*--- 
+      Iterate up to BZ_N_ITERS times to improve the tables.
+   ---*/
+   for (iter = 0; iter < BZ_N_ITERS; iter++) {
+
+      for (t = 0; t < nGroups; t++) fave[t] = 0;
+
+      for (t = 0; t < nGroups; t++)
+         for (v = 0; v < alphaSize; v++)
+            s->rfreq[t][v] = 0;
+
+      /*---
+        Set up an auxiliary length table which is used to fast-track
+	the common case (nGroups == 6). 
+      ---*/
+      if (nGroups == 6) {
+         for (v = 0; v < alphaSize; v++) {
+            s->len_pack[v][0] = (s->len[1][v] << 16) | s->len[0][v];
+            s->len_pack[v][1] = (s->len[3][v] << 16) | s->len[2][v];
+            s->len_pack[v][2] = (s->len[5][v] << 16) | s->len[4][v];
+	 }
+      }
+
+      nSelectors = 0;
+      totc = 0;
+      gs = 0;
+      while (True) {
+
+         /*--- Set group start & end marks. --*/
+         if (gs >= s->nMTF) break;
+         ge = gs + BZ_G_SIZE - 1; 
+         if (ge >= s->nMTF) ge = s->nMTF-1;
+
+         /*-- 
+            Calculate the cost of this group as coded
+            by each of the coding tables.
+         --*/
+         for (t = 0; t < nGroups; t++) cost[t] = 0;
+
+         if (nGroups == 6 && 50 == ge-gs+1) {
+            /*--- fast track the common case ---*/
+            register UInt32 cost01, cost23, cost45;
+            register UInt16 icv;
+            cost01 = cost23 = cost45 = 0;
+
+#           define BZ_ITER(nn)                \
+               icv = mtfv[gs+(nn)];           \
+               cost01 += s->len_pack[icv][0]; \
+               cost23 += s->len_pack[icv][1]; \
+               cost45 += s->len_pack[icv][2]; \
+
+            BZ_ITER(0);  BZ_ITER(1);  BZ_ITER(2);  BZ_ITER(3);  BZ_ITER(4);
+            BZ_ITER(5);  BZ_ITER(6);  BZ_ITER(7);  BZ_ITER(8);  BZ_ITER(9);
+            BZ_ITER(10); BZ_ITER(11); BZ_ITER(12); BZ_ITER(13); BZ_ITER(14);
+            BZ_ITER(15); BZ_ITER(16); BZ_ITER(17); BZ_ITER(18); BZ_ITER(19);
+            BZ_ITER(20); BZ_ITER(21); BZ_ITER(22); BZ_ITER(23); BZ_ITER(24);
+            BZ_ITER(25); BZ_ITER(26); BZ_ITER(27); BZ_ITER(28); BZ_ITER(29);
+            BZ_ITER(30); BZ_ITER(31); BZ_ITER(32); BZ_ITER(33); BZ_ITER(34);
+            BZ_ITER(35); BZ_ITER(36); BZ_ITER(37); BZ_ITER(38); BZ_ITER(39);
+            BZ_ITER(40); BZ_ITER(41); BZ_ITER(42); BZ_ITER(43); BZ_ITER(44);
+            BZ_ITER(45); BZ_ITER(46); BZ_ITER(47); BZ_ITER(48); BZ_ITER(49);
+
+#           undef BZ_ITER
+
+            cost[0] = cost01 & 0xffff; cost[1] = cost01 >> 16;
+            cost[2] = cost23 & 0xffff; cost[3] = cost23 >> 16;
+            cost[4] = cost45 & 0xffff; cost[5] = cost45 >> 16;
+
+         } else {
+	    /*--- slow version which correctly handles all situations ---*/
+            for (i = gs; i <= ge; i++) { 
+               UInt16 icv = mtfv[i];
+               for (t = 0; t < nGroups; t++) cost[t] += s->len[t][icv];
+            }
+         }
+ 
+         /*-- 
+            Find the coding table which is best for this group,
+            and record its identity in the selector table.
+         --*/
+         bc = 999999999; bt = -1;
+         for (t = 0; t < nGroups; t++)
+            if (cost[t] < bc) { bc = cost[t]; bt = t; };
+         totc += bc;
+         fave[bt]++;
+         s->selector[nSelectors] = bt;
+         nSelectors++;
+
+         /*-- 
+            Increment the symbol frequencies for the selected table.
+          --*/
+         if (nGroups == 6 && 50 == ge-gs+1) {
+            /*--- fast track the common case ---*/
+
+#           define BZ_ITUR(nn) s->rfreq[bt][ mtfv[gs+(nn)] ]++
+
+            BZ_ITUR(0);  BZ_ITUR(1);  BZ_ITUR(2);  BZ_ITUR(3);  BZ_ITUR(4);
+            BZ_ITUR(5);  BZ_ITUR(6);  BZ_ITUR(7);  BZ_ITUR(8);  BZ_ITUR(9);
+            BZ_ITUR(10); BZ_ITUR(11); BZ_ITUR(12); BZ_ITUR(13); BZ_ITUR(14);
+            BZ_ITUR(15); BZ_ITUR(16); BZ_ITUR(17); BZ_ITUR(18); BZ_ITUR(19);
+            BZ_ITUR(20); BZ_ITUR(21); BZ_ITUR(22); BZ_ITUR(23); BZ_ITUR(24);
+            BZ_ITUR(25); BZ_ITUR(26); BZ_ITUR(27); BZ_ITUR(28); BZ_ITUR(29);
+            BZ_ITUR(30); BZ_ITUR(31); BZ_ITUR(32); BZ_ITUR(33); BZ_ITUR(34);
+            BZ_ITUR(35); BZ_ITUR(36); BZ_ITUR(37); BZ_ITUR(38); BZ_ITUR(39);
+            BZ_ITUR(40); BZ_ITUR(41); BZ_ITUR(42); BZ_ITUR(43); BZ_ITUR(44);
+            BZ_ITUR(45); BZ_ITUR(46); BZ_ITUR(47); BZ_ITUR(48); BZ_ITUR(49);
+
+#           undef BZ_ITUR
+
+         } else {
+	    /*--- slow version which correctly handles all situations ---*/
+            for (i = gs; i <= ge; i++)
+               s->rfreq[bt][ mtfv[i] ]++;
+         }
+
+         gs = ge+1;
+      }
+      if (s->verbosity >= 3) {
+         VPrintf2 ( "      pass %d: size is %d, grp uses are ", 
+                   iter+1, totc/8 );
+         for (t = 0; t < nGroups; t++)
+            VPrintf1 ( "%d ", fave[t] );
+         VPrintf0 ( "\n" );
+      }
+
+      /*--
+        Recompute the tables based on the accumulated frequencies.
+      --*/
+      /* maxLen was changed from 20 to 17 in bzip2-1.0.3.  See 
+         comment in huffman.c for details. */
+      for (t = 0; t < nGroups; t++)
+         BZ2_hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]), 
+                                 alphaSize, 17 /*20*/ );
+   }
+
+
+   AssertH( nGroups < 8, 3002 );
+   AssertH( nSelectors < 32768 &&
+            nSelectors <= (2 + (900000 / BZ_G_SIZE)),
+            3003 );
+
+
+   /*--- Compute MTF values for the selectors. ---*/
+   {
+      UChar pos[BZ_N_GROUPS], ll_i, tmp2, tmp;
+      for (i = 0; i < nGroups; i++) pos[i] = i;
+      for (i = 0; i < nSelectors; i++) {
+         ll_i = s->selector[i];
+         j = 0;
+         tmp = pos[j];
+         while ( ll_i != tmp ) {
+            j++;
+            tmp2 = tmp;
+            tmp = pos[j];
+            pos[j] = tmp2;
+         };
+         pos[0] = tmp;
+         s->selectorMtf[i] = j;
+      }
+   };
+
+   /*--- Assign actual codes for the tables. --*/
+   for (t = 0; t < nGroups; t++) {
+      minLen = 32;
+      maxLen = 0;
+      for (i = 0; i < alphaSize; i++) {
+         if (s->len[t][i] > maxLen) maxLen = s->len[t][i];
+         if (s->len[t][i] < minLen) minLen = s->len[t][i];
+      }
+      AssertH ( !(maxLen > 17 /*20*/ ), 3004 );
+      AssertH ( !(minLen < 1),  3005 );
+      BZ2_hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]), 
+                          minLen, maxLen, alphaSize );
+   }
+
+   /*--- Transmit the mapping table. ---*/
+   { 
+      Bool inUse16[16];
+      for (i = 0; i < 16; i++) {
+          inUse16[i] = False;
+          for (j = 0; j < 16; j++)
+             if (s->inUse[i * 16 + j]) inUse16[i] = True;
+      }
+     
+      nBytes = s->numZ;
+      for (i = 0; i < 16; i++)
+         if (inUse16[i]) bsW(s,1,1); else bsW(s,1,0);
+
+      for (i = 0; i < 16; i++)
+         if (inUse16[i])
+            for (j = 0; j < 16; j++) {
+               if (s->inUse[i * 16 + j]) bsW(s,1,1); else bsW(s,1,0);
+            }
+
+      if (s->verbosity >= 3) 
+         VPrintf1( "      bytes: mapping %d, ", s->numZ-nBytes );
+   }
+
+   /*--- Now the selectors. ---*/
+   nBytes = s->numZ;
+   bsW ( s, 3, nGroups );
+   bsW ( s, 15, nSelectors );
+   for (i = 0; i < nSelectors; i++) { 
+      for (j = 0; j < s->selectorMtf[i]; j++) bsW(s,1,1);
+      bsW(s,1,0);
+   }
+   if (s->verbosity >= 3)
+      VPrintf1( "selectors %d, ", s->numZ-nBytes );
+
+   /*--- Now the coding tables. ---*/
+   nBytes = s->numZ;
+
+   for (t = 0; t < nGroups; t++) {
+      Int32 curr = s->len[t][0];
+      bsW ( s, 5, curr );
+      for (i = 0; i < alphaSize; i++) {
+         while (curr < s->len[t][i]) { bsW(s,2,2); curr++; /* 10 */ };
+         while (curr > s->len[t][i]) { bsW(s,2,3); curr--; /* 11 */ };
+         bsW ( s, 1, 0 );
+      }
+   }
+
+   if (s->verbosity >= 3)
+      VPrintf1 ( "code lengths %d, ", s->numZ-nBytes );
+
+   /*--- And finally, the block data proper ---*/
+   nBytes = s->numZ;
+   selCtr = 0;
+   gs = 0;
+   while (True) {
+      if (gs >= s->nMTF) break;
+      ge = gs + BZ_G_SIZE - 1; 
+      if (ge >= s->nMTF) ge = s->nMTF-1;
+      AssertH ( s->selector[selCtr] < nGroups, 3006 );
+
+      if (nGroups == 6 && 50 == ge-gs+1) {
+            /*--- fast track the common case ---*/
+            UInt16 mtfv_i;
+            UChar* s_len_sel_selCtr 
+               = &(s->len[s->selector[selCtr]][0]);
+            Int32* s_code_sel_selCtr
+               = &(s->code[s->selector[selCtr]][0]);
+
+#           define BZ_ITAH(nn)                      \
+               mtfv_i = mtfv[gs+(nn)];              \
+               bsW ( s,                             \
+                     s_len_sel_selCtr[mtfv_i],      \
+                     s_code_sel_selCtr[mtfv_i] )
+
+            BZ_ITAH(0);  BZ_ITAH(1);  BZ_ITAH(2);  BZ_ITAH(3);  BZ_ITAH(4);
+            BZ_ITAH(5);  BZ_ITAH(6);  BZ_ITAH(7);  BZ_ITAH(8);  BZ_ITAH(9);
+            BZ_ITAH(10); BZ_ITAH(11); BZ_ITAH(12); BZ_ITAH(13); BZ_ITAH(14);
+            BZ_ITAH(15); BZ_ITAH(16); BZ_ITAH(17); BZ_ITAH(18); BZ_ITAH(19);
+            BZ_ITAH(20); BZ_ITAH(21); BZ_ITAH(22); BZ_ITAH(23); BZ_ITAH(24);
+            BZ_ITAH(25); BZ_ITAH(26); BZ_ITAH(27); BZ_ITAH(28); BZ_ITAH(29);
+            BZ_ITAH(30); BZ_ITAH(31); BZ_ITAH(32); BZ_ITAH(33); BZ_ITAH(34);
+            BZ_ITAH(35); BZ_ITAH(36); BZ_ITAH(37); BZ_ITAH(38); BZ_ITAH(39);
+            BZ_ITAH(40); BZ_ITAH(41); BZ_ITAH(42); BZ_ITAH(43); BZ_ITAH(44);
+            BZ_ITAH(45); BZ_ITAH(46); BZ_ITAH(47); BZ_ITAH(48); BZ_ITAH(49);
+
+#           undef BZ_ITAH
+
+      } else {
+	 /*--- slow version which correctly handles all situations ---*/
+         for (i = gs; i <= ge; i++) {
+            bsW ( s, 
+                  s->len  [s->selector[selCtr]] [mtfv[i]],
+                  s->code [s->selector[selCtr]] [mtfv[i]] );
+         }
+      }
+
+
+      gs = ge+1;
+      selCtr++;
+   }
+   AssertH( selCtr == nSelectors, 3007 );
+
+   if (s->verbosity >= 3)
+      VPrintf1( "codes %d\n", s->numZ-nBytes );
+}
+
+
+/*---------------------------------------------------*/
+/* Compress and emit one block.  Takes whatever has accumulated in
+   s->block (s->nblock bytes) and writes the compressed bits; if
+   is_last_block is set, the stream trailer is appended too.  An
+   empty block (s->nblock == 0) is legal and contributes only the
+   stream header and/or trailer. */
+void BZ2_compressBlock ( EState* s, Bool is_last_block )
+{
+   if (s->nblock > 0) {
+
+      BZ_FINALISE_CRC ( s->blockCRC );
+      /* Fold this block's CRC into the stream-wide combined CRC:
+         rotate left by one, then xor in the block CRC. */
+      s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
+      s->combinedCRC ^= s->blockCRC;
+      /* For the second and subsequent blocks, restart the output
+         byte counter; the first block keeps the stream header. */
+      if (s->blockNo > 1) s->numZ = 0;
+
+      if (s->verbosity >= 2)
+         VPrintf4( "    block %d: crc = 0x%08x, "
+                   "combined CRC = 0x%08x, size = %d\n",
+                   s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );
+
+      BZ2_blockSort ( s );
+   }
+
+   /* Compressed output bytes are laid down just past the block data
+      inside arr2. */
+   s->zbits = (UChar*) (&((UChar*)s->arr2)[s->nblock]);
+
+   /*-- If this is the first block, create the stream header. --*/
+   if (s->blockNo == 1) {
+      BZ2_bsInitWrite ( s );
+      bsPutUChar ( s, BZ_HDR_B );
+      bsPutUChar ( s, BZ_HDR_Z );
+      bsPutUChar ( s, BZ_HDR_h );
+      bsPutUChar ( s, (UChar)(BZ_HDR_0 + s->blockSize100k) );
+   }
+
+   if (s->nblock > 0) {
+
+      /* Block header magic: the digits of pi, 0x314159265359. */
+      bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );
+      bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );
+      bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );
+
+      /*-- Now the block's CRC, so it is in a known place. --*/
+      bsPutUInt32 ( s, s->blockCRC );
+
+      /*-- 
+         Now a single bit indicating (non-)randomisation. 
+         As of version 0.9.5, we use a better sorting algorithm
+         which makes randomisation unnecessary.  So always set
+         the randomised bit to 'no'.  Of course, the decoder
+         still needs to be able to handle randomised blocks
+         so as to maintain backwards compatibility with
+         older versions of bzip2.
+      --*/
+      bsW(s,1,0);
+
+      /* Origin pointer for the inverse BWT, then the MTF/Huffman
+         coded data proper. */
+      bsW ( s, 24, s->origPtr );
+      generateMTFValues ( s );
+      sendMTFValues ( s );
+   }
+
+
+   /*-- If this is the last block, add the stream trailer. --*/
+   if (is_last_block) {
+
+      /* Stream trailer magic, 0x177245385090, then the combined CRC. */
+      bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );
+      bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );
+      bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );
+      bsPutUInt32 ( s, s->combinedCRC );
+      if (s->verbosity >= 2)
+         VPrintf1( "    final combined CRC = 0x%08x\n   ", s->combinedCRC );
+      bsFinishWrite ( s );
+   }
+}
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                        compress.c ---*/
+/*-------------------------------------------------------------*/
+
+
+/*-------------------------------------------------------------*/
+/*--- Table for randomising repetitive blocks               ---*/
+/*---                                           randtable.c ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+
+
+
+/*---------------------------------------------*/
+/* 512 pseudo-random numbers, each in the range 0..999.  Kept only
+   for backwards compatibility: the modern compressor never emits
+   randomised blocks, but the decompressor still uses this table to
+   de-randomise blocks produced by very old bzip2 versions. */
+Int32 BZ2_rNums[512] = { 
+   619, 720, 127, 481, 931, 816, 813, 233, 566, 247, 
+   985, 724, 205, 454, 863, 491, 741, 242, 949, 214, 
+   733, 859, 335, 708, 621, 574, 73, 654, 730, 472, 
+   419, 436, 278, 496, 867, 210, 399, 680, 480, 51, 
+   878, 465, 811, 169, 869, 675, 611, 697, 867, 561, 
+   862, 687, 507, 283, 482, 129, 807, 591, 733, 623, 
+   150, 238, 59, 379, 684, 877, 625, 169, 643, 105, 
+   170, 607, 520, 932, 727, 476, 693, 425, 174, 647, 
+   73, 122, 335, 530, 442, 853, 695, 249, 445, 515, 
+   909, 545, 703, 919, 874, 474, 882, 500, 594, 612, 
+   641, 801, 220, 162, 819, 984, 589, 513, 495, 799, 
+   161, 604, 958, 533, 221, 400, 386, 867, 600, 782, 
+   382, 596, 414, 171, 516, 375, 682, 485, 911, 276, 
+   98, 553, 163, 354, 666, 933, 424, 341, 533, 870, 
+   227, 730, 475, 186, 263, 647, 537, 686, 600, 224, 
+   469, 68, 770, 919, 190, 373, 294, 822, 808, 206, 
+   184, 943, 795, 384, 383, 461, 404, 758, 839, 887, 
+   715, 67, 618, 276, 204, 918, 873, 777, 604, 560, 
+   951, 160, 578, 722, 79, 804, 96, 409, 713, 940, 
+   652, 934, 970, 447, 318, 353, 859, 672, 112, 785, 
+   645, 863, 803, 350, 139, 93, 354, 99, 820, 908, 
+   609, 772, 154, 274, 580, 184, 79, 626, 630, 742, 
+   653, 282, 762, 623, 680, 81, 927, 626, 789, 125, 
+   411, 521, 938, 300, 821, 78, 343, 175, 128, 250, 
+   170, 774, 972, 275, 999, 639, 495, 78, 352, 126, 
+   857, 956, 358, 619, 580, 124, 737, 594, 701, 612, 
+   669, 112, 134, 694, 363, 992, 809, 743, 168, 974, 
+   944, 375, 748, 52, 600, 747, 642, 182, 862, 81, 
+   344, 805, 988, 739, 511, 655, 814, 334, 249, 515, 
+   897, 955, 664, 981, 649, 113, 974, 459, 893, 228, 
+   433, 837, 553, 268, 926, 240, 102, 654, 459, 51, 
+   686, 754, 806, 760, 493, 403, 415, 394, 687, 700, 
+   946, 670, 656, 610, 738, 392, 760, 799, 887, 653, 
+   978, 321, 576, 617, 626, 502, 894, 679, 243, 440, 
+   680, 879, 194, 572, 640, 724, 926, 56, 204, 700, 
+   707, 151, 457, 449, 797, 195, 791, 558, 945, 679, 
+   297, 59, 87, 824, 713, 663, 412, 693, 342, 606, 
+   134, 108, 571, 364, 631, 212, 174, 643, 304, 329, 
+   343, 97, 430, 751, 497, 314, 983, 374, 822, 928, 
+   140, 206, 73, 263, 980, 736, 876, 478, 430, 305, 
+   170, 514, 364, 692, 829, 82, 855, 953, 676, 246, 
+   369, 970, 294, 750, 807, 827, 150, 790, 288, 923, 
+   804, 378, 215, 828, 592, 281, 565, 555, 710, 82, 
+   896, 831, 547, 261, 524, 462, 293, 465, 502, 56, 
+   661, 821, 976, 991, 658, 869, 905, 758, 745, 193, 
+   768, 550, 608, 933, 378, 286, 215, 979, 792, 961, 
+   61, 688, 793, 644, 986, 403, 106, 366, 905, 644, 
+   372, 567, 466, 434, 645, 210, 389, 550, 919, 135, 
+   780, 773, 635, 389, 707, 100, 626, 958, 165, 504, 
+   920, 176, 193, 713, 857, 265, 203, 50, 668, 108, 
+   645, 990, 626, 197, 510, 357, 358, 850, 858, 364, 
+   936, 638
+};
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                       randtable.c ---*/
+/*-------------------------------------------------------------*/
+
+/*-------------------------------------------------------------*/
+/*--- Table for doing CRCs                                  ---*/
+/*---                                            crctable.c ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+
+
+
+
+/*--
+  I think this is an implementation of the AUTODIN-II,
+  Ethernet & FDDI 32-bit CRC standard.  Vaguely derived
+  from code by Rob Warnock, in Section 51 of the
+  comp.compression FAQ.
+--*/
+
+/* Byte-at-a-time CRC-32 lookup table for the AUTODIN-II / Ethernet
+   polynomial 0x04C11DB7 (see entry [1]), in MSB-first (non-reflected)
+   form: table[i] is the CRC of the single byte i shifted into the top
+   of the register. */
+UInt32 BZ2_crc32Table[256] = {
+
+   /*-- Ugly, innit? --*/
+
+   0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
+   0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
+   0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
+   0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
+   0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
+   0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
+   0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
+   0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
+   0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
+   0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
+   0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
+   0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
+   0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
+   0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
+   0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
+   0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
+   0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
+   0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
+   0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
+   0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
+   0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
+   0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
+   0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
+   0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
+   0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
+   0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
+   0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
+   0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
+   0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
+   0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
+   0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
+   0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
+   0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
+   0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
+   0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
+   0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
+   0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
+   0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
+   0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
+   0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
+   0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
+   0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
+   0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
+   0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
+   0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
+   0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
+   0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
+   0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
+   0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
+   0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
+   0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
+   0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
+   0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
+   0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
+   0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
+   0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
+   0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
+   0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
+   0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
+   0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
+   0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
+   0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
+   0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
+   0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
+};
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                        crctable.c ---*/
+/*-------------------------------------------------------------*/
+
+/*-------------------------------------------------------------*/
+/*--- Library top-level functions.                          ---*/
+/*---                                               bzlib.c ---*/
+/*-------------------------------------------------------------*/
+
+/*--
+  This file is a part of bzip2 and/or libbzip2, a program and
+  library for lossless, block-sorting data compression.
+
+  Copyright (C) 1996-2004 Julian R Seward.  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+  2. The origin of this software must not be misrepresented; you must 
+     not claim that you wrote the original software.  If you use this 
+     software in a product, an acknowledgment in the product 
+     documentation would be appreciated but is not required.
+
+  3. Altered source versions must be plainly marked as such, and must
+     not be misrepresented as being the original software.
+
+  4. The name of the author may not be used to endorse or promote 
+     products derived from this software without specific prior written 
+     permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  Julian Seward, Cambridge, UK.
+  jseward@bzip.org
+  bzip2/libbzip2 version 1.0 of 21 March 2000
+
+  This program is based on (at least) the work of:
+     Mike Burrows
+     David Wheeler
+     Peter Fenwick
+     Alistair Moffat
+     Radford Neal
+     Ian H. Witten
+     Robert Sedgewick
+     Jon L. Bentley
+
+  For more information on these sources, see the manual.
+--*/
+
+/*--
+   CHANGES
+   ~~~~~~~
+   0.9.0 -- original version.
+
+   0.9.0a/b -- no changes in this file.
+
+   0.9.0c
+      * made zero-length BZ_FLUSH work correctly in bzCompress().
+      * fixed bzWrite/bzRead to ignore zero-length requests.
+      * fixed bzread to correctly handle read requests after EOF.
+      * wrong parameter order in call to bzDecompressInit in
+        bzBuffToBuffDecompress.  Fixed.
+--*/
+
+
+
+/*---------------------------------------------------*/
+/*--- Compression stuff                           ---*/
+/*---------------------------------------------------*/
+
+
+/*---------------------------------------------------*/
+/* AssertH failure hook for the embedded bzip2 code: report the
+   failing error code, then terminate through the service function. */
+void BZ2_bz__AssertH__fail ( int errcode )
+{
+   vex_printf("BZ2_bz__AssertH__fail(%d) called, exiting\n", errcode);
+   serviceFn(0,0);
+}
+
+/* Called by libbzip2 when it detects an internal consistency
+   failure.  Report the error code and terminate via the service
+   function.  Fix: the original format string had no conversion, so
+   errcode was passed to vex_printf but never printed (mirrors the
+   style of BZ2_bz__AssertH__fail above). */
+void bz_internal_error ( int errcode )
+{
+   vex_printf("bz_internal_error(%d) called, exiting\n", errcode);
+   (*serviceFn)(0,0);
+}
+
+/*---------------------------------------------------*/
+static
+int bz_config_ok ( void )
+{
+   /* The compressor assumes exactly 32-bit ints, 16-bit shorts and
+      8-bit chars; return 1 iff this platform matches, else 0. */
+   return sizeof(int)   == 4
+          && sizeof(short) == 2
+          && sizeof(char)  == 1;
+}
+
+
+/*---------------------------------------------------*/
+static
+void* default_bzalloc ( void* opaque, Int32 items, Int32 size )
+{
+   /* Default allocator: forward the request (items*size bytes) to
+      the service function (call code 2) and return its result. */
+   return (void*) (*serviceFn)( 2, items * size );
+}
+
+static
+void default_bzfree ( void* opaque, void* addr )
+{
+   /* Default deallocator: freeing NULL is a no-op; otherwise hand
+      the pointer back through the service function (call code 3). */
+   if (addr == NULL)
+      return;
+   (*serviceFn)( 3, (HWord)addr );
+}
+
+
+/*---------------------------------------------------*/
+static
+void prepare_new_block ( EState* s )
+{
+   /* Reset all per-block state ready to accumulate a fresh block,
+      and advance the block counter. */
+   Int32 j;
+   s->nblock        = 0;
+   s->numZ          = 0;
+   s->state_out_pos = 0;
+   BZ_INITIALISE_CRC ( s->blockCRC );
+   for (j = 0; j < 256; j++)
+      s->inUse[j] = False;
+   s->blockNo++;
+}
+
+
+/*---------------------------------------------------*/
+static
+void init_RL ( EState* s )
+{
+   /* Reset the run-length encoder: no pending run, and 256 acts as
+      the "no character pending" sentinel for state_in_ch. */
+   s->state_in_len = 0;
+   s->state_in_ch  = 256;
+}
+
+
+static
+Bool isempty_RL ( EState* s )
+{
+   /* The RL encoder holds data exactly when a real character
+      (state_in_ch < 256) has a pending run (state_in_len > 0). */
+   return (s->state_in_ch < 256 && s->state_in_len > 0)
+             ? False : True;
+}
+
+
+/*---------------------------------------------------*/
+/* Initialise a compression stream.  Allocates the EState and its
+   work arrays, validates arguments, and puts the stream into
+   BZ_M_RUNNING mode.  Returns BZ_OK, or BZ_CONFIG_ERROR /
+   BZ_PARAM_ERROR / BZ_MEM_ERROR on failure. */
+int BZ_API(BZ2_bzCompressInit) 
+                    ( bz_stream* strm, 
+                     int        blockSize100k,
+                     int        verbosity,
+                     int        workFactor )
+{
+   Int32   n;
+   EState* s;
+
+   if (!bz_config_ok()) return BZ_CONFIG_ERROR;
+
+   /* Block size must be 1..9 (x 100000 bytes); work factor 0..250,
+      where 0 selects the default chosen below. */
+   if (strm == NULL || 
+       blockSize100k < 1 || blockSize100k > 9 ||
+       workFactor < 0 || workFactor > 250)
+     return BZ_PARAM_ERROR;
+
+   if (workFactor == 0) workFactor = 30;
+   /* Install the default allocator/deallocator if the caller did
+      not supply custom ones. */
+   if (strm->bzalloc == NULL) strm->bzalloc = default_bzalloc;
+   if (strm->bzfree == NULL) strm->bzfree = default_bzfree;
+
+   s = BZALLOC( sizeof(EState) );
+   if (s == NULL) return BZ_MEM_ERROR;
+   s->strm = strm;
+
+   s->arr1 = NULL;
+   s->arr2 = NULL;
+   s->ftab = NULL;
+
+   n       = 100000 * blockSize100k;
+   s->arr1 = BZALLOC( n                  * sizeof(UInt32) );
+   s->arr2 = BZALLOC( (n+BZ_N_OVERSHOOT) * sizeof(UInt32) );
+   s->ftab = BZALLOC( 65537              * sizeof(UInt32) );
+
+   /* If any allocation failed, release whatever did succeed before
+      reporting the error. */
+   if (s->arr1 == NULL || s->arr2 == NULL || s->ftab == NULL) {
+      if (s->arr1 != NULL) BZFREE(s->arr1);
+      if (s->arr2 != NULL) BZFREE(s->arr2);
+      if (s->ftab != NULL) BZFREE(s->ftab);
+      if (s       != NULL) BZFREE(s);
+      return BZ_MEM_ERROR;
+   }
+
+   s->blockNo           = 0;
+   s->state             = BZ_S_INPUT;
+   s->mode              = BZ_M_RUNNING;
+   s->combinedCRC       = 0;
+   s->blockSize100k     = blockSize100k;
+   /* Keep 19 bytes of headroom below the absolute block size. */
+   s->nblockMAX         = 100000 * blockSize100k - 19;
+   s->verbosity         = verbosity;
+   s->workFactor        = workFactor;
+
+   /* block, mtfv and ptr are views onto arr1/arr2 used at different
+      phases of compression. */
+   s->block             = (UChar*)s->arr2;
+   s->mtfv              = (UInt16*)s->arr1;
+   s->zbits             = NULL;
+   s->ptr               = (UInt32*)s->arr1;
+
+   strm->state          = s;
+   strm->total_in_lo32  = 0;
+   strm->total_in_hi32  = 0;
+   strm->total_out_lo32 = 0;
+   strm->total_out_hi32 = 0;
+   init_RL ( s );
+   prepare_new_block ( s );
+   return BZ_OK;
+}
+
+
+/*---------------------------------------------------*/
+static
+void add_pair_to_block ( EState* s )
+{
+   /* Flush the pending run (state_in_len copies of state_in_ch) into
+      the block, using bzip2's initial RLE format: runs of 1..3 bytes
+      are stored literally; longer runs are stored as four copies of
+      the byte followed by a count byte holding (length - 4). */
+   Int32 i;
+   UChar ch = (UChar)(s->state_in_ch);
+   /* Every byte of the run participates in the block CRC. */
+   for (i = 0; i < s->state_in_len; i++) {
+      BZ_UPDATE_CRC( s->blockCRC, ch );
+   }
+   s->inUse[s->state_in_ch] = True;
+   switch (s->state_in_len) {
+      case 1:
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         break;
+      case 2:
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         break;
+      case 3:
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         break;
+      default:
+         /* The count byte is itself a block symbol, so mark its
+            value as used too. */
+         s->inUse[s->state_in_len-4] = True;
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         s->block[s->nblock] = (UChar)ch; s->nblock++;
+         s->block[s->nblock] = ((UChar)(s->state_in_len-4));
+         s->nblock++;
+         break;
+   }
+}
+
+
+/*---------------------------------------------------*/
+static
+void flush_RL ( EState* s )
+{
+   /* Emit any pending run into the block, then reset the run-length
+      encoder to its empty state. */
+   if (s->state_in_ch < 256)
+      add_pair_to_block ( s );
+   init_RL ( s );
+}
+
+
+/*---------------------------------------------------*/
+/* Feed one input byte (zchh0) into the run-length encoder for the
+   current block.  The common case (new character, previous run of
+   length 1) is handled inline; otherwise either the pending run is
+   flushed and a new one started, or the current run is extended.
+   Runs are capped at length 255 because the count is stored in a
+   single byte (see add_pair_to_block).  NB: zs is evaluated many
+   times, so pass a plain pointer expression. */
+#define ADD_CHAR_TO_BLOCK(zs,zchh0)               \
+{                                                 \
+   UInt32 zchh = (UInt32)(zchh0);                 \
+   /*-- fast track the common case --*/           \
+   if (zchh != zs->state_in_ch &&                 \
+       zs->state_in_len == 1) {                   \
+      UChar ch = (UChar)(zs->state_in_ch);        \
+      BZ_UPDATE_CRC( zs->blockCRC, ch );          \
+      zs->inUse[zs->state_in_ch] = True;          \
+      zs->block[zs->nblock] = (UChar)ch;          \
+      zs->nblock++;                               \
+      zs->state_in_ch = zchh;                     \
+   }                                              \
+   else                                           \
+   /*-- general, uncommon cases --*/              \
+   if (zchh != zs->state_in_ch ||                 \
+      zs->state_in_len == 255) {                  \
+      if (zs->state_in_ch < 256)                  \
+         add_pair_to_block ( zs );                \
+      zs->state_in_ch = zchh;                     \
+      zs->state_in_len = 1;                       \
+   } else {                                       \
+      zs->state_in_len++;                         \
+   }                                              \
+}
+
+
+/*---------------------------------------------------*/
+static
+Bool copy_input_until_stop ( EState* s )
+{
+   /* Move bytes from the caller's input buffer through the RLE stage
+      into the current block.  Stops when the block fills up or the
+      input runs out; when flushing/finishing (mode != BZ_M_RUNNING)
+      it additionally stops once the agreed amount avail_in_expect
+      has been consumed.  Returns True iff any input was consumed. */
+   Bool progress_in = False;
+
+   if (s->mode == BZ_M_RUNNING) {
+
+      /*-- fast track the common case --*/
+      while (True) {
+         /*-- block full? --*/
+         if (s->nblock >= s->nblockMAX) break;
+         /*-- no input? --*/
+         if (s->strm->avail_in == 0) break;
+         progress_in = True;
+         ADD_CHAR_TO_BLOCK ( s, (UInt32)(*((UChar*)(s->strm->next_in))) ); 
+         s->strm->next_in++;
+         s->strm->avail_in--;
+         s->strm->total_in_lo32++;
+         /* 64-bit input counter kept as two 32-bit halves. */
+         if (s->strm->total_in_lo32 == 0) s->strm->total_in_hi32++;
+      }
+
+   } else {
+
+      /*-- general, uncommon case --*/
+      while (True) {
+         /*-- block full? --*/
+         if (s->nblock >= s->nblockMAX) break;
+         /*-- no input? --*/
+         if (s->strm->avail_in == 0) break;
+         /*-- flush/finish end? --*/
+         if (s->avail_in_expect == 0) break;
+         progress_in = True;
+         ADD_CHAR_TO_BLOCK ( s, (UInt32)(*((UChar*)(s->strm->next_in))) ); 
+         s->strm->next_in++;
+         s->strm->avail_in--;
+         s->strm->total_in_lo32++;
+         if (s->strm->total_in_lo32 == 0) s->strm->total_in_hi32++;
+         s->avail_in_expect--;
+      }
+   }
+   return progress_in;
+}
+
+
+/*---------------------------------------------------*/
+static
+Bool copy_output_until_stop ( EState* s )
+{
+   /* Drain compressed bytes (s->zbits, s->numZ of them) into the
+      caller's output buffer, stopping when the buffer is full or the
+      block is exhausted.  Returns True iff any byte was moved. */
+   Bool moved_any = False;
+
+   while (s->strm->avail_out > 0 && s->state_out_pos < s->numZ) {
+      moved_any = True;
+      *(s->strm->next_out) = s->zbits[s->state_out_pos];
+      s->state_out_pos++;
+      s->strm->next_out++;
+      s->strm->avail_out--;
+      /* 64-bit output counter kept as two 32-bit halves. */
+      s->strm->total_out_lo32++;
+      if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++;
+   }
+
+   return moved_any;
+}
+
+
+/*---------------------------------------------------*/
+/* Core compression driver: alternate between filling the current
+   block with input (BZ_S_INPUT) and emitting compressed bytes
+   (BZ_S_OUTPUT) until no further progress can be made.  Returns
+   True iff any input was consumed or any output was produced. */
+static
+Bool handle_compress ( bz_stream* strm )
+{
+   Bool progress_in  = False;
+   Bool progress_out = False;
+   EState* s = strm->state;
+   
+   while (True) {
+
+      if (s->state == BZ_S_OUTPUT) {
+         progress_out |= copy_output_until_stop ( s );
+         /* output buffer filled before the block drained: stop */
+         if (s->state_out_pos < s->numZ) break;
+         /* finishing, all input consumed, RL accumulator empty: done */
+         if (s->mode == BZ_M_FINISHING && 
+             s->avail_in_expect == 0 &&
+             isempty_RL(s)) break;
+         prepare_new_block ( s );
+         s->state = BZ_S_INPUT;
+         if (s->mode == BZ_M_FLUSHING && 
+             s->avail_in_expect == 0 &&
+             isempty_RL(s)) break;
+      }
+
+      if (s->state == BZ_S_INPUT) {
+         progress_in |= copy_input_until_stop ( s );
+         if (s->mode != BZ_M_RUNNING && s->avail_in_expect == 0) {
+            /* flushing/finishing and all expected input consumed:
+               compress what we have; mark final iff finishing */
+            flush_RL ( s );
+            BZ2_compressBlock ( s, (Bool)(s->mode == BZ_M_FINISHING) );
+            s->state = BZ_S_OUTPUT;
+         }
+         else
+         if (s->nblock >= s->nblockMAX) {
+            /* block is full: compress it (not the last one) */
+            BZ2_compressBlock ( s, False );
+            s->state = BZ_S_OUTPUT;
+         }
+         else
+         if (s->strm->avail_in == 0) {
+            /* out of input: wait for the caller to supply more */
+            break;
+         }
+      }
+
+   }
+
+   return progress_in || progress_out;
+}
+
+
+/*---------------------------------------------------*/
+/* Public compression entry point.  'action' is BZ_RUN, BZ_FLUSH or
+   BZ_FINISH; which actions are legal depends on the stream's mode
+   (RUNNING/FLUSHING/FINISHING).  A flush/finish request latches the
+   amount of input that must still be consumed (avail_in_expect)
+   before the request can complete. */
+int BZ_API(BZ2_bzCompress) ( bz_stream *strm, int action )
+{
+   Bool progress;
+   EState* s;
+   if (strm == NULL) return BZ_PARAM_ERROR;
+   s = strm->state;
+   if (s == NULL) return BZ_PARAM_ERROR;
+   if (s->strm != strm) return BZ_PARAM_ERROR;
+
+   preswitch:
+   switch (s->mode) {
+
+      case BZ_M_IDLE:
+         return BZ_SEQUENCE_ERROR;
+      case BZ_M_RUNNING:
+         if (action == BZ_RUN) {
+            progress = handle_compress ( strm );
+            /* no progress with BZ_RUN means the caller supplied
+               neither input nor output space */
+            return progress ? BZ_RUN_OK : BZ_PARAM_ERROR;
+         } 
+         else
+	 if (action == BZ_FLUSH) {
+            s->avail_in_expect = strm->avail_in;
+            s->mode = BZ_M_FLUSHING;
+            goto preswitch; /* re-dispatch in the new mode */
+         }
+         else
+         if (action == BZ_FINISH) {
+            s->avail_in_expect = strm->avail_in;
+            s->mode = BZ_M_FINISHING;
+            goto preswitch; /* re-dispatch in the new mode */
+         }
+         else 
+            return BZ_PARAM_ERROR;
+
+      case BZ_M_FLUSHING:
+         if (action != BZ_FLUSH) return BZ_SEQUENCE_ERROR;
+         /* the caller may not add input in the middle of a flush */
+         if (s->avail_in_expect != s->strm->avail_in) 
+            return BZ_SEQUENCE_ERROR;
+         progress = handle_compress ( strm );
+         /* still input to eat or output to emit: flush continues */
+         if (s->avail_in_expect > 0 || !isempty_RL(s) ||
+             s->state_out_pos < s->numZ) return BZ_FLUSH_OK;
+         s->mode = BZ_M_RUNNING;
+         return BZ_RUN_OK;
+
+      case BZ_M_FINISHING:
+         if (action != BZ_FINISH) return BZ_SEQUENCE_ERROR;
+         /* the caller may not add input in the middle of a finish */
+         if (s->avail_in_expect != s->strm->avail_in) 
+            return BZ_SEQUENCE_ERROR;
+         progress = handle_compress ( strm );
+         if (!progress) return BZ_SEQUENCE_ERROR;
+         if (s->avail_in_expect > 0 || !isempty_RL(s) ||
+             s->state_out_pos < s->numZ) return BZ_FINISH_OK;
+         s->mode = BZ_M_IDLE;
+         return BZ_STREAM_END;
+   }
+   return BZ_OK; /*--not reached--*/
+}
+
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzCompressEnd)  ( bz_stream *strm )
+{
+   /* Release all state belonging to a compression stream.  The
+      stream must have been set up by a successful
+      BZ2_bzCompressInit. */
+   EState* s;
+
+   if (strm == NULL) return BZ_PARAM_ERROR;
+   s = strm->state;
+   if (s == NULL || s->strm != strm) return BZ_PARAM_ERROR;
+
+   /* free the three work areas, if they were allocated */
+   if (s->arr1 != NULL) BZFREE(s->arr1);
+   if (s->arr2 != NULL) BZFREE(s->arr2);
+   if (s->ftab != NULL) BZFREE(s->ftab);
+   BZFREE(strm->state);
+   strm->state = NULL;
+
+   return BZ_OK;
+}
+
+
+/*---------------------------------------------------*/
+/*--- Decompression stuff                         ---*/
+/*---------------------------------------------------*/
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzDecompressInit) 
+                     ( bz_stream* strm, 
+                       int        verbosity,
+                       int        small )
+{
+   /* Set up a fresh decompression stream.  'small' (0 or 1) selects
+      the low-memory decoder; 'verbosity' in 0..4 controls the amount
+      of diagnostic output. */
+   DState* s;
+
+   if (!bz_config_ok()) return BZ_CONFIG_ERROR;
+
+   /* validate arguments */
+   if (strm == NULL) return BZ_PARAM_ERROR;
+   if (small != 0 && small != 1) return BZ_PARAM_ERROR;
+   if (verbosity < 0 || verbosity > 4) return BZ_PARAM_ERROR;
+
+   /* install the default allocators if the caller supplied none */
+   if (strm->bzalloc == NULL) strm->bzalloc = default_bzalloc;
+   if (strm->bzfree == NULL) strm->bzfree = default_bzfree;
+
+   s = BZALLOC( sizeof(DState) );
+   if (s == NULL) return BZ_MEM_ERROR;
+
+   /* link stream and state both ways */
+   s->strm     = strm;
+   strm->state = s;
+
+   /* the decoder starts by expecting the stream magic bytes */
+   s->state                 = BZ_X_MAGIC_1;
+   s->bsLive                = 0;
+   s->bsBuff                = 0;
+   s->calculatedCombinedCRC = 0;
+   s->smallDecompress       = (Bool)small;
+   s->ll4                   = NULL;
+   s->ll16                  = NULL;
+   s->tt                    = NULL;
+   s->currBlockNo           = 0;
+   s->verbosity             = verbosity;
+
+   /* reset the 64-bit byte counters (split into lo/hi words) */
+   strm->total_in_lo32  = 0;
+   strm->total_in_hi32  = 0;
+   strm->total_out_lo32 = 0;
+   strm->total_out_hi32 = 0;
+
+   return BZ_OK;
+}
+
+
+/*---------------------------------------------------*/
+/* Undo the run-length encoding for the FAST (non-small) decoder,
+   writing decoded bytes into the stream's output buffer.
+   Return  True iff data corruption is discovered.
+   Returns False if there is no problem.
+*/
+static
+Bool unRLE_obuf_to_output_FAST ( DState* s )
+{
+   UChar k1;
+
+   if (s->blockRandomised) {
+
+      /* Rare path: every byte fetched with BZ_GET_FAST must be
+         XORed with BZ_RAND_MASK (updated via BZ_RAND_UPD_MASK). */
+      while (True) {
+         /* try to finish existing run */
+         while (True) {
+            if (s->strm->avail_out == 0) return False;
+            if (s->state_out_len == 0) break;
+            *( (UChar*)(s->strm->next_out) ) = s->state_out_ch;
+            BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch );
+            s->state_out_len--;
+            s->strm->next_out++;
+            s->strm->avail_out--;
+            s->strm->total_out_lo32++;
+            if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++;
+         }
+
+         /* can a new run be started? */
+         if (s->nblock_used == s->save_nblock+1) return False;
+               
+         /* Only caused by corrupt data stream? */
+         if (s->nblock_used > s->save_nblock+1)
+            return True;
+   
+         /* Decode the next run: 1..3 equal bytes are literal; a
+            fourth equal byte means the next byte is an extra repeat
+            count, giving a run length of 4..259. */
+         s->state_out_len = 1;
+         s->state_out_ch = s->k0;
+         BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         s->state_out_len = 2;
+         BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         s->state_out_len = 3;
+         BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         s->state_out_len = ((Int32)k1) + 4;
+         BZ_GET_FAST(s->k0); BZ_RAND_UPD_MASK; 
+         s->k0 ^= BZ_RAND_MASK; s->nblock_used++;
+      }
+
+   } else {
+
+      /* Common path.  The hot state is cached in locals ("restore"),
+         the decode loop runs on the locals, and they are written
+         back ("save") before returning. */
+      /* restore */
+      UInt32        c_calculatedBlockCRC = s->calculatedBlockCRC;
+      UChar         c_state_out_ch       = s->state_out_ch;
+      Int32         c_state_out_len      = s->state_out_len;
+      Int32         c_nblock_used        = s->nblock_used;
+      Int32         c_k0                 = s->k0;
+      UInt32*       c_tt                 = s->tt;
+      UInt32        c_tPos               = s->tPos;
+      char*         cs_next_out          = s->strm->next_out;
+      unsigned int  cs_avail_out         = s->strm->avail_out;
+      /* end restore */
+
+      UInt32       avail_out_INIT = cs_avail_out;
+      Int32        s_save_nblockPP = s->save_nblock+1;
+      unsigned int total_out_lo32_old;
+
+      while (True) {
+
+         /* try to finish existing run */
+         if (c_state_out_len > 0) {
+            while (True) {
+               if (cs_avail_out == 0) goto return_notr;
+               if (c_state_out_len == 1) break;
+               *( (UChar*)(cs_next_out) ) = c_state_out_ch;
+               BZ_UPDATE_CRC ( c_calculatedBlockCRC, c_state_out_ch );
+               c_state_out_len--;
+               cs_next_out++;
+               cs_avail_out--;
+            }
+            /* emit the final (or only) byte of the run */
+            s_state_out_len_eq_one:
+            {
+               if (cs_avail_out == 0) { 
+                  c_state_out_len = 1; goto return_notr;
+               };
+               *( (UChar*)(cs_next_out) ) = c_state_out_ch;
+               BZ_UPDATE_CRC ( c_calculatedBlockCRC, c_state_out_ch );
+               cs_next_out++;
+               cs_avail_out--;
+            }
+         }   
+         /* Only caused by corrupt data stream? */
+         if (c_nblock_used > s_save_nblockPP)
+            return True;
+
+         /* can a new run be started? */
+         if (c_nblock_used == s_save_nblockPP) {
+            c_state_out_len = 0; goto return_notr;
+         };   
+         c_state_out_ch = c_k0;
+         BZ_GET_FAST_C(k1); c_nblock_used++;
+         if (k1 != c_k0) { 
+            c_k0 = k1; goto s_state_out_len_eq_one; 
+         };
+         if (c_nblock_used == s_save_nblockPP) 
+            goto s_state_out_len_eq_one;
+   
+         c_state_out_len = 2;
+         BZ_GET_FAST_C(k1); c_nblock_used++;
+         if (c_nblock_used == s_save_nblockPP) continue;
+         if (k1 != c_k0) { c_k0 = k1; continue; };
+   
+         c_state_out_len = 3;
+         BZ_GET_FAST_C(k1); c_nblock_used++;
+         if (c_nblock_used == s_save_nblockPP) continue;
+         if (k1 != c_k0) { c_k0 = k1; continue; };
+   
+         /* fourth equal byte: next byte is the extra repeat count */
+         BZ_GET_FAST_C(k1); c_nblock_used++;
+         c_state_out_len = ((Int32)k1) + 4;
+         BZ_GET_FAST_C(c_k0); c_nblock_used++;
+      }
+
+      return_notr:
+      /* account for the bytes emitted this call (64-bit counter) */
+      total_out_lo32_old = s->strm->total_out_lo32;
+      s->strm->total_out_lo32 += (avail_out_INIT - cs_avail_out);
+      if (s->strm->total_out_lo32 < total_out_lo32_old)
+         s->strm->total_out_hi32++;
+
+      /* save */
+      s->calculatedBlockCRC = c_calculatedBlockCRC;
+      s->state_out_ch       = c_state_out_ch;
+      s->state_out_len      = c_state_out_len;
+      s->nblock_used        = c_nblock_used;
+      s->k0                 = c_k0;
+      s->tt                 = c_tt;
+      s->tPos               = c_tPos;
+      s->strm->next_out     = cs_next_out;
+      s->strm->avail_out    = cs_avail_out;
+      /* end save */
+   }
+   return False;
+}
+
+
+
+/*---------------------------------------------------*/
+/* Undo the run-length encoding for the SMALL (low-memory) decoder,
+   writing decoded bytes into the stream's output buffer.
+   Return  True iff data corruption is discovered.
+   Returns False if there is no problem.
+*/
+static
+Bool unRLE_obuf_to_output_SMALL ( DState* s )
+{
+   UChar k1;
+
+   if (s->blockRandomised) {
+
+      /* Rare path: each byte fetched with BZ_GET_SMALL must be
+         XORed with BZ_RAND_MASK (updated via BZ_RAND_UPD_MASK). */
+      while (True) {
+         /* try to finish existing run */
+         while (True) {
+            if (s->strm->avail_out == 0) return False;
+            if (s->state_out_len == 0) break;
+            *( (UChar*)(s->strm->next_out) ) = s->state_out_ch;
+            BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch );
+            s->state_out_len--;
+            s->strm->next_out++;
+            s->strm->avail_out--;
+            s->strm->total_out_lo32++;
+            if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++;
+         }
+   
+         /* can a new run be started? */
+         if (s->nblock_used == s->save_nblock+1) return False;
+
+         /* Only caused by corrupt data stream? */
+         if (s->nblock_used > s->save_nblock+1)
+            return True;
+   
+         /* Decode the next run: 1..3 equal bytes are literal; a
+            fourth equal byte means the next byte is an extra repeat
+            count, giving a run length of 4..259. */
+         s->state_out_len = 1;
+         s->state_out_ch = s->k0;
+         BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         s->state_out_len = 2;
+         BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         s->state_out_len = 3;
+         BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; 
+         k1 ^= BZ_RAND_MASK; s->nblock_used++;
+         s->state_out_len = ((Int32)k1) + 4;
+         BZ_GET_SMALL(s->k0); BZ_RAND_UPD_MASK; 
+         s->k0 ^= BZ_RAND_MASK; s->nblock_used++;
+      }
+
+   } else {
+
+      /* Common path: same structure, without the derandomisation. */
+      while (True) {
+         /* try to finish existing run */
+         while (True) {
+            if (s->strm->avail_out == 0) return False;
+            if (s->state_out_len == 0) break;
+            *( (UChar*)(s->strm->next_out) ) = s->state_out_ch;
+            BZ_UPDATE_CRC ( s->calculatedBlockCRC, s->state_out_ch );
+            s->state_out_len--;
+            s->strm->next_out++;
+            s->strm->avail_out--;
+            s->strm->total_out_lo32++;
+            if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++;
+         }
+   
+         /* can a new run be started? */
+         if (s->nblock_used == s->save_nblock+1) return False;
+
+         /* Only caused by corrupt data stream? */
+         if (s->nblock_used > s->save_nblock+1)
+            return True;
+   
+         s->state_out_len = 1;
+         s->state_out_ch = s->k0;
+         BZ_GET_SMALL(k1); s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         s->state_out_len = 2;
+         BZ_GET_SMALL(k1); s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         s->state_out_len = 3;
+         BZ_GET_SMALL(k1); s->nblock_used++;
+         if (s->nblock_used == s->save_nblock+1) continue;
+         if (k1 != s->k0) { s->k0 = k1; continue; };
+   
+         /* fourth equal byte: next byte is the extra repeat count */
+         BZ_GET_SMALL(k1); s->nblock_used++;
+         s->state_out_len = ((Int32)k1) + 4;
+         BZ_GET_SMALL(s->k0); s->nblock_used++;
+      }
+
+   }
+}
+
+
+/*---------------------------------------------------*/
+/* Public decompression entry point.  Alternates between emitting
+   decoded output from the current block (BZ_X_OUTPUT) and parsing
+   more of the compressed stream via BZ2_decompress.  Returns BZ_OK
+   when more input or output space is needed, BZ_STREAM_END on
+   successful completion, or an error code. */
+int BZ_API(BZ2_bzDecompress) ( bz_stream *strm )
+{
+   Bool    corrupt;
+   DState* s;
+   if (strm == NULL) return BZ_PARAM_ERROR;
+   s = strm->state;
+   if (s == NULL) return BZ_PARAM_ERROR;
+   if (s->strm != strm) return BZ_PARAM_ERROR;
+
+   while (True) {
+      if (s->state == BZ_X_IDLE) return BZ_SEQUENCE_ERROR;
+      if (s->state == BZ_X_OUTPUT) {
+         /* emit bytes via the un-RLE variant chosen at init time */
+         if (s->smallDecompress)
+            corrupt = unRLE_obuf_to_output_SMALL ( s ); else
+            corrupt = unRLE_obuf_to_output_FAST  ( s );
+         if (corrupt) return BZ_DATA_ERROR;
+         if (s->nblock_used == s->save_nblock+1 && s->state_out_len == 0) {
+            /* block fully emitted: check its CRC, then fold it into
+               the running combined CRC and look for the next block */
+            BZ_FINALISE_CRC ( s->calculatedBlockCRC );
+            if (s->verbosity >= 3) 
+               VPrintf2 ( " {0x%08x, 0x%08x}", s->storedBlockCRC, 
+                          s->calculatedBlockCRC );
+            if (s->verbosity >= 2) VPrintf0 ( "]" );
+            if (s->calculatedBlockCRC != s->storedBlockCRC)
+               return BZ_DATA_ERROR;
+            /* combined = rotate-left-1(combined) ^ blockCRC */
+            s->calculatedCombinedCRC 
+               = (s->calculatedCombinedCRC << 1) | 
+                    (s->calculatedCombinedCRC >> 31);
+            s->calculatedCombinedCRC ^= s->calculatedBlockCRC;
+            s->state = BZ_X_BLKHDR_1;
+         } else {
+            return BZ_OK;
+         }
+      }
+      if (s->state >= BZ_X_MAGIC_1) {
+         Int32 r = BZ2_decompress ( s );
+         if (r == BZ_STREAM_END) {
+            if (s->verbosity >= 3)
+               VPrintf2 ( "\n    combined CRCs: stored = 0x%08x, computed = 0x%08x", 
+                          s->storedCombinedCRC, s->calculatedCombinedCRC );
+            if (s->calculatedCombinedCRC != s->storedCombinedCRC)
+               return BZ_DATA_ERROR;
+            return r;
+         }
+         if (s->state != BZ_X_OUTPUT) return r;
+      }
+   }
+
+   AssertH ( 0, 6001 );
+
+   return 0;  /*NOTREACHED*/
+}
+
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzDecompressEnd)  ( bz_stream *strm )
+{
+   /* Tear down a decompression stream and release its buffers. */
+   DState* s;
+
+   if (strm == NULL) return BZ_PARAM_ERROR;
+   s = strm->state;
+   if (s == NULL || s->strm != strm) return BZ_PARAM_ERROR;
+
+   /* free whichever decode buffers were allocated */
+   if (s->tt   != NULL) BZFREE(s->tt);
+   if (s->ll16 != NULL) BZFREE(s->ll16);
+   if (s->ll4  != NULL) BZFREE(s->ll4);
+   BZFREE(strm->state);
+   strm->state = NULL;
+
+   return BZ_OK;
+}
+
+
+#ifndef BZ_NO_STDIO
+/*---------------------------------------------------*/
+/*--- File I/O stuff                              ---*/
+/*---------------------------------------------------*/
+
+/* Record an error code both in the caller's *bzerror (if supplied)
+   and in the handle's lastErr field (if the handle exists).  Relies
+   on variables named 'bzerror' and 'bzf' being in scope. */
+#define BZ_SETERR(eee)                    \
+{                                         \
+   if (bzerror != NULL) *bzerror = eee;   \
+   if (bzf != NULL) bzf->lastErr = eee;   \
+}
+
+/* Per-handle state for the high-level (stdio-based) interface. */
+typedef 
+   struct {
+      FILE*     handle;             /* underlying stdio stream */
+      Char      buf[BZ_MAX_UNUSED]; /* staging buffer for file I/O */
+      Int32     bufN;               /* number of valid bytes in buf */
+      Bool      writing;            /* True: write handle, False: read */
+      bz_stream strm;               /* embedded low-level stream */
+      Int32     lastErr;            /* most recent error for handle */
+      Bool      initialisedOk;      /* low-level init succeeded? */
+   }
+   bzFile;
+
+
+/*---------------------------------------------*/
+/* Non-destructively test a stdio stream for end-of-file by reading
+   one byte ahead and pushing it back. */
+static Bool myfeof ( FILE* f )
+{
+   Int32 lookahead = fgetc ( f );
+   if (lookahead == EOF) return True;
+   ungetc ( lookahead, f );
+   return False;
+}
+
+
+/*---------------------------------------------------*/
+BZFILE* BZ_API(BZ2_bzWriteOpen) 
+                    ( int*  bzerror,      
+                      FILE* f, 
+                      int   blockSize100k, 
+                      int   verbosity,
+                      int   workFactor )
+{
+   /* Create a handle for writing compressed data to 'f'.  Returns
+      NULL on failure, with the reason recorded via BZ_SETERR. */
+   bzFile* bzf = NULL;
+   Int32   ret;
+
+   BZ_SETERR(BZ_OK);
+
+   /* reject bad arguments up front */
+   if (f == NULL)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+   if (blockSize100k < 1 || blockSize100k > 9)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+   if (workFactor < 0 || workFactor > 250)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+   if (verbosity < 0 || verbosity > 4)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+
+   if (ferror(f))
+      { BZ_SETERR(BZ_IO_ERROR); return NULL; }
+
+   bzf = malloc ( sizeof(bzFile) );
+   if (bzf == NULL)
+      { BZ_SETERR(BZ_MEM_ERROR); return NULL; }
+
+   BZ_SETERR(BZ_OK);
+   bzf->initialisedOk = False;
+   bzf->bufN          = 0;
+   bzf->handle        = f;
+   bzf->writing       = True;
+   bzf->strm.bzalloc  = NULL;
+   bzf->strm.bzfree   = NULL;
+   bzf->strm.opaque   = NULL;
+
+   /* workFactor == 0 requests the default */
+   if (workFactor == 0) workFactor = 30;
+   ret = BZ2_bzCompressInit ( &(bzf->strm), blockSize100k, 
+                              verbosity, workFactor );
+   if (ret != BZ_OK)
+      { BZ_SETERR(ret); free(bzf); return NULL; }
+
+   bzf->strm.avail_in = 0;
+   bzf->initialisedOk = True;
+   return bzf;   
+}
+
+
+
+/*---------------------------------------------------*/
+/* Compress 'len' bytes from 'buf' and write the compressed output
+   to the file behind handle 'b'.  Errors are reported via
+   BZ_SETERR (into *bzerror and bzf->lastErr). */
+void BZ_API(BZ2_bzWrite)
+             ( int*    bzerror, 
+               BZFILE* b, 
+               void*   buf, 
+               int     len )
+{
+   Int32 n, n2, ret;
+   bzFile* bzf = (bzFile*)b;
+
+   BZ_SETERR(BZ_OK);
+   if (bzf == NULL || buf == NULL || len < 0)
+      { BZ_SETERR(BZ_PARAM_ERROR); return; };
+   if (!(bzf->writing))
+      { BZ_SETERR(BZ_SEQUENCE_ERROR); return; };
+   if (ferror(bzf->handle))
+      { BZ_SETERR(BZ_IO_ERROR); return; };
+
+   if (len == 0)
+      { BZ_SETERR(BZ_OK); return; };
+
+   bzf->strm.avail_in = len;
+   bzf->strm.next_in  = buf;
+
+   while (True) {
+      /* compress into the staging buffer, then flush it to disk */
+      bzf->strm.avail_out = BZ_MAX_UNUSED;
+      bzf->strm.next_out = bzf->buf;
+      ret = BZ2_bzCompress ( &(bzf->strm), BZ_RUN );
+      if (ret != BZ_RUN_OK)
+         { BZ_SETERR(ret); return; };
+
+      if (bzf->strm.avail_out < BZ_MAX_UNUSED) {
+         n = BZ_MAX_UNUSED - bzf->strm.avail_out;
+         n2 = fwrite ( (void*)(bzf->buf), sizeof(UChar), 
+                       n, bzf->handle );
+         if (n != n2 || ferror(bzf->handle))
+            { BZ_SETERR(BZ_IO_ERROR); return; };
+      }
+
+      /* loop until the compressor has consumed all of buf */
+      if (bzf->strm.avail_in == 0)
+         { BZ_SETERR(BZ_OK); return; };
+   }
+}
+
+
+/*---------------------------------------------------*/
+void BZ_API(BZ2_bzWriteClose)
+                  ( int*          bzerror, 
+                    BZFILE*       b, 
+                    int           abandon,
+                    unsigned int* nbytes_in,
+                    unsigned int* nbytes_out )
+{
+   /* Legacy 32-bit entry point: delegate to the 64-bit variant and
+      discard the high words of the byte counts. */
+   BZ2_bzWriteClose64 ( bzerror, b, abandon,
+                        nbytes_in, NULL, nbytes_out, NULL );
+}
+
+
+/* Finish (or abandon) a write handle.  Unless 'abandon' is set or a
+   prior error was recorded, the compressor is driven to completion
+   with BZ_FINISH and all remaining output is written to the file.
+   Total byte counts are reported through any non-NULL
+   out-parameters; finally the compressor state and the handle
+   itself are freed. */
+void BZ_API(BZ2_bzWriteClose64)
+                  ( int*          bzerror, 
+                    BZFILE*       b, 
+                    int           abandon,
+                    unsigned int* nbytes_in_lo32,
+                    unsigned int* nbytes_in_hi32,
+                    unsigned int* nbytes_out_lo32,
+                    unsigned int* nbytes_out_hi32 )
+{
+   Int32   n, n2, ret;
+   bzFile* bzf = (bzFile*)b;
+
+   if (bzf == NULL)
+      { BZ_SETERR(BZ_OK); return; };
+   if (!(bzf->writing))
+      { BZ_SETERR(BZ_SEQUENCE_ERROR); return; };
+   if (ferror(bzf->handle))
+      { BZ_SETERR(BZ_IO_ERROR); return; };
+
+   /* default the reported counts to zero */
+   if (nbytes_in_lo32 != NULL) *nbytes_in_lo32 = 0;
+   if (nbytes_in_hi32 != NULL) *nbytes_in_hi32 = 0;
+   if (nbytes_out_lo32 != NULL) *nbytes_out_lo32 = 0;
+   if (nbytes_out_hi32 != NULL) *nbytes_out_hi32 = 0;
+
+   if ((!abandon) && bzf->lastErr == BZ_OK) {
+      /* drive the compressor to completion, flushing each chunk */
+      while (True) {
+         bzf->strm.avail_out = BZ_MAX_UNUSED;
+         bzf->strm.next_out = bzf->buf;
+         ret = BZ2_bzCompress ( &(bzf->strm), BZ_FINISH );
+         if (ret != BZ_FINISH_OK && ret != BZ_STREAM_END)
+            { BZ_SETERR(ret); return; };
+
+         if (bzf->strm.avail_out < BZ_MAX_UNUSED) {
+            n = BZ_MAX_UNUSED - bzf->strm.avail_out;
+            n2 = fwrite ( (void*)(bzf->buf), sizeof(UChar), 
+                          n, bzf->handle );
+            if (n != n2 || ferror(bzf->handle))
+               { BZ_SETERR(BZ_IO_ERROR); return; };
+         }
+
+         if (ret == BZ_STREAM_END) break;
+      }
+   }
+
+   /* push buffered data out to the OS (the file is not fclose'd
+      here; that is the caller's responsibility) */
+   if ( !abandon && !ferror ( bzf->handle ) ) {
+      fflush ( bzf->handle );
+      if (ferror(bzf->handle))
+         { BZ_SETERR(BZ_IO_ERROR); return; };
+   }
+
+   if (nbytes_in_lo32 != NULL)
+      *nbytes_in_lo32 = bzf->strm.total_in_lo32;
+   if (nbytes_in_hi32 != NULL)
+      *nbytes_in_hi32 = bzf->strm.total_in_hi32;
+   if (nbytes_out_lo32 != NULL)
+      *nbytes_out_lo32 = bzf->strm.total_out_lo32;
+   if (nbytes_out_hi32 != NULL)
+      *nbytes_out_hi32 = bzf->strm.total_out_hi32;
+
+   BZ_SETERR(BZ_OK);
+   BZ2_bzCompressEnd ( &(bzf->strm) );
+   free ( bzf );
+}
+
+
+/*---------------------------------------------------*/
+BZFILE* BZ_API(BZ2_bzReadOpen) 
+                   ( int*  bzerror, 
+                     FILE* f, 
+                     int   verbosity,
+                     int   small,
+                     void* unused,
+                     int   nUnused )
+{
+   /* Create a handle for reading compressed data from 'f'.  Any
+      bytes supplied in 'unused' (e.g. those returned by
+      BZ2_bzReadGetUnused for a previous stream) are fed to the
+      decompressor ahead of the file contents. */
+   bzFile* bzf = NULL;
+   UChar*  spare;
+   int     ret;
+
+   BZ_SETERR(BZ_OK);
+
+   /* validate all arguments before allocating anything */
+   if (f == NULL)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+   if (small != 0 && small != 1)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+   if (verbosity < 0 || verbosity > 4)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+   if (unused == NULL && nUnused != 0)
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+   if (unused != NULL && (nUnused < 0 || nUnused > BZ_MAX_UNUSED))
+      { BZ_SETERR(BZ_PARAM_ERROR); return NULL; }
+
+   if (ferror(f))
+      { BZ_SETERR(BZ_IO_ERROR); return NULL; }
+
+   bzf = malloc ( sizeof(bzFile) );
+   if (bzf == NULL) 
+      { BZ_SETERR(BZ_MEM_ERROR); return NULL; }
+
+   BZ_SETERR(BZ_OK);
+
+   bzf->initialisedOk = False;
+   bzf->handle        = f;
+   bzf->bufN          = 0;
+   bzf->writing       = False;
+   bzf->strm.bzalloc  = NULL;
+   bzf->strm.bzfree   = NULL;
+   bzf->strm.opaque   = NULL;
+
+   /* stage the caller's leftover bytes ahead of any file input */
+   spare = (UChar*)unused;
+   while (nUnused > 0) {
+      bzf->buf[bzf->bufN] = *spare;
+      bzf->bufN++;
+      spare++;
+      nUnused--;
+   }
+
+   ret = BZ2_bzDecompressInit ( &(bzf->strm), verbosity, small );
+   if (ret != BZ_OK)
+      { BZ_SETERR(ret); free(bzf); return NULL; }
+
+   bzf->strm.avail_in = bzf->bufN;
+   bzf->strm.next_in  = bzf->buf;
+
+   bzf->initialisedOk = True;
+   return bzf;   
+}
+
+
+/*---------------------------------------------------*/
+void BZ_API(BZ2_bzReadClose) ( int *bzerror, BZFILE *b )
+{
+   /* Dispose of a read handle.  A NULL handle is accepted and is a
+      no-op (reported as BZ_OK).  The underlying FILE* is NOT
+      closed. */
+   bzFile* bzf = (bzFile*)b;
+
+   BZ_SETERR(BZ_OK);
+   if (bzf == NULL)
+      { BZ_SETERR(BZ_OK); return; }
+
+   /* closing a write handle with the read variant is a misuse */
+   if (bzf->writing)
+      { BZ_SETERR(BZ_SEQUENCE_ERROR); return; }
+
+   /* only tear down the decompressor if it was ever set up */
+   if (bzf->initialisedOk)
+      (void)BZ2_bzDecompressEnd ( &(bzf->strm) );
+   free ( bzf );
+}
+
+
+/*---------------------------------------------------*/
+/* Read and decompress up to 'len' bytes into 'buf', refilling the
+   handle's staging buffer from the file as the decompressor drains
+   it.  Returns the number of bytes placed in buf; *bzerror is set
+   to BZ_OK, BZ_STREAM_END, or an error code. */
+int BZ_API(BZ2_bzRead) 
+           ( int*    bzerror, 
+             BZFILE* b, 
+             void*   buf, 
+             int     len )
+{
+   Int32   n, ret;
+   bzFile* bzf = (bzFile*)b;
+
+   BZ_SETERR(BZ_OK);
+
+   if (bzf == NULL || buf == NULL || len < 0)
+      { BZ_SETERR(BZ_PARAM_ERROR); return 0; };
+
+   if (bzf->writing)
+      { BZ_SETERR(BZ_SEQUENCE_ERROR); return 0; };
+
+   if (len == 0)
+      { BZ_SETERR(BZ_OK); return 0; };
+
+   bzf->strm.avail_out = len;
+   bzf->strm.next_out = buf;
+
+   while (True) {
+
+      if (ferror(bzf->handle)) 
+         { BZ_SETERR(BZ_IO_ERROR); return 0; };
+
+      /* refill the staging buffer when the decompressor has
+         consumed it, unless the file is already at EOF */
+      if (bzf->strm.avail_in == 0 && !myfeof(bzf->handle)) {
+         n = fread ( bzf->buf, sizeof(UChar), 
+                     BZ_MAX_UNUSED, bzf->handle );
+         if (ferror(bzf->handle))
+            { BZ_SETERR(BZ_IO_ERROR); return 0; };
+         bzf->bufN = n;
+         bzf->strm.avail_in = bzf->bufN;
+         bzf->strm.next_in = bzf->buf;
+      }
+
+      ret = BZ2_bzDecompress ( &(bzf->strm) );
+
+      if (ret != BZ_OK && ret != BZ_STREAM_END)
+         { BZ_SETERR(ret); return 0; };
+
+      /* file exhausted but the stream did not end: truncated input */
+      if (ret == BZ_OK && myfeof(bzf->handle) && 
+          bzf->strm.avail_in == 0 && bzf->strm.avail_out > 0)
+         { BZ_SETERR(BZ_UNEXPECTED_EOF); return 0; };
+
+      if (ret == BZ_STREAM_END)
+         { BZ_SETERR(BZ_STREAM_END);
+           return len - bzf->strm.avail_out; };
+      if (bzf->strm.avail_out == 0)
+         { BZ_SETERR(BZ_OK); return len; };
+      
+   }
+
+   return 0; /*not reached*/
+}
+
+
+/*---------------------------------------------------*/
+void BZ_API(BZ2_bzReadGetUnused) 
+                     ( int*    bzerror, 
+                       BZFILE* b, 
+                       void**  unused, 
+                       int*    nUnused )
+{
+   /* After BZ_STREAM_END, hand back any input bytes that were read
+      from the file but not consumed by the decompressor, so the
+      caller can pass them to a subsequent BZ2_bzReadOpen. */
+   bzFile* bzf = (bzFile*)b;
+
+   if (bzf == NULL)
+      { BZ_SETERR(BZ_PARAM_ERROR); return; }
+   if (bzf->lastErr != BZ_STREAM_END)
+      { BZ_SETERR(BZ_SEQUENCE_ERROR); return; }
+   if (unused == NULL || nUnused == NULL)
+      { BZ_SETERR(BZ_PARAM_ERROR); return; }
+
+   BZ_SETERR(BZ_OK);
+   *nUnused = bzf->strm.avail_in;
+   *unused  = bzf->strm.next_in;
+}
+#endif
+
+
+/*---------------------------------------------------*/
+/*--- Misc convenience stuff                      ---*/
+/*---------------------------------------------------*/
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzBuffToBuffCompress) 
+                         ( char*         dest, 
+                           unsigned int* destLen,
+                           char*         source, 
+                           unsigned int  sourceLen,
+                           int           blockSize100k, 
+                           int           verbosity, 
+                           int           workFactor )
+{
+   /* One-shot convenience wrapper: compress sourceLen bytes from
+      'source' into 'dest'.  On entry *destLen is the capacity of
+      dest; on success it is updated to the compressed size. */
+   bz_stream strm;
+   int ret;
+
+   if (dest == NULL || destLen == NULL || 
+       source == NULL ||
+       blockSize100k < 1 || blockSize100k > 9 ||
+       verbosity < 0 || verbosity > 4 ||
+       workFactor < 0 || workFactor > 250) 
+      return BZ_PARAM_ERROR;
+
+   if (workFactor == 0) workFactor = 30;  /* 0 selects the default */
+   strm.bzalloc = NULL;
+   strm.bzfree  = NULL;
+   strm.opaque  = NULL;
+
+   ret = BZ2_bzCompressInit ( &strm, blockSize100k, 
+                              verbosity, workFactor );
+   if (ret != BZ_OK) return ret;
+
+   strm.next_in   = source;
+   strm.avail_in  = sourceLen;
+   strm.next_out  = dest;
+   strm.avail_out = *destLen;
+
+   ret = BZ2_bzCompress ( &strm, BZ_FINISH );
+
+   if (ret == BZ_STREAM_END) {
+      /* normal termination: report the compressed length */
+      *destLen -= strm.avail_out;
+      BZ2_bzCompressEnd ( &strm );
+      return BZ_OK;
+   }
+
+   BZ2_bzCompressEnd ( &strm );
+   /* BZ_FINISH_OK means dest filled up before compression ended */
+   return (ret == BZ_FINISH_OK) ? BZ_OUTBUFF_FULL : ret;
+}
+
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzBuffToBuffDecompress) 
+                           ( char*         dest, 
+                             unsigned int* destLen,
+                             char*         source, 
+                             unsigned int  sourceLen,
+                             int           small,
+                             int           verbosity )
+{
+   /* One-shot convenience wrapper: decompress sourceLen bytes from
+      'source' into 'dest'.  On entry *destLen is the capacity of
+      dest; on success it is updated to the decompressed size. */
+   bz_stream strm;
+   int ret;
+
+   if (dest == NULL || destLen == NULL || 
+       source == NULL ||
+       (small != 0 && small != 1) ||
+       verbosity < 0 || verbosity > 4) 
+          return BZ_PARAM_ERROR;
+
+   strm.bzalloc = NULL;
+   strm.bzfree  = NULL;
+   strm.opaque  = NULL;
+   ret = BZ2_bzDecompressInit ( &strm, verbosity, small );
+   if (ret != BZ_OK) return ret;
+
+   strm.next_in   = source;
+   strm.avail_in  = sourceLen;
+   strm.next_out  = dest;
+   strm.avail_out = *destLen;
+
+   ret = BZ2_bzDecompress ( &strm );
+
+   if (ret == BZ_STREAM_END) {
+      /* normal termination */
+      *destLen -= strm.avail_out;
+      BZ2_bzDecompressEnd ( &strm );
+      return BZ_OK;
+   }
+
+   BZ2_bzDecompressEnd ( &strm );
+   if (ret != BZ_OK) return ret;
+   /* BZ_OK here means the decompressor stopped early: either the
+      output buffer filled (no space left) or the input ran out. */
+   return (strm.avail_out > 0) ? BZ_UNEXPECTED_EOF : BZ_OUTBUFF_FULL;
+}
+
+
+/*---------------------------------------------------*/
+/*--
+   Code contributed by Yoshioka Tsuneo
+   (QWF00133@niftyserve.or.jp/tsuneo-y@is.aist-nara.ac.jp),
+   to support better zlib compatibility.
+   This code is not _officially_ part of libbzip2 (yet);
+   I haven't tested it, documented it, or considered the
+   threading-safeness of it.
+   If this code breaks, please contact both Yoshioka and me.
+--*/
+/*---------------------------------------------------*/
+
+/*---------------------------------------------------*/
+/*--
+   return version like "0.9.0c".
+--*/
+/* Returns the library's compile-time version string (BZ_VERSION). */
+const char * BZ_API(BZ2_bzlibVersion)(void)
+{
+   return BZ_VERSION;
+}
+
+
+#ifndef BZ_NO_STDIO
+/*---------------------------------------------------*/
+
+#if defined(_WIN32) || defined(OS2) || defined(MSDOS)
+#   include <fcntl.h>
+#   include <io.h>
+   /* On platforms that distinguish text and binary streams, force
+      the stream into binary mode so no newline translation occurs;
+      elsewhere this is a no-op. */
+#   define SET_BINARY_MODE(file) setmode(fileno(file),O_BINARY)
+#else
+#   define SET_BINARY_MODE(file)
+#endif
+/* Shared implementation of BZ2_bzopen (open by path, open_mode==0)
+   and BZ2_bzdopen (wrap an existing fd, open_mode==1).  The mode
+   string is scanned for 'r'/'w', 's' (small decompress mode) and a
+   digit giving the block size in 100k units. */
+static
+BZFILE * bzopen_or_bzdopen
+               ( const char *path,   /* no use when bzdopen */
+                 int fd,             /* no use when bzdopen */
+                 const char *mode,
+                 int open_mode)      /* bzopen: 0, bzdopen:1 */
+{
+   int    bzerr;
+   char   unused[BZ_MAX_UNUSED];
+   int    blockSize100k = 9;
+   int    writing       = 0;
+   char   mode2[10]     = "";
+   FILE   *fp           = NULL;
+   BZFILE *bzfp         = NULL;
+   int    verbosity     = 0;
+   int    workFactor    = 30;
+   int    smallMode     = 0;
+   int    nUnused       = 0; 
+
+   if (mode == NULL) return NULL;
+   /* parse the zlib-style mode string */
+   while (*mode) {
+      switch (*mode) {
+      case 'r':
+         writing = 0; break;
+      case 'w':
+         writing = 1; break;
+      case 's':
+         smallMode = 1; break;
+      default:
+         if (isdigit((int)(*mode))) {
+            blockSize100k = *mode-BZ_HDR_0;  /* digit char -> value */
+         }
+      }
+      mode++;
+   }
+   strcat(mode2, writing ? "w" : "r" );
+   strcat(mode2,"b");   /* binary mode */
+
+   if (open_mode==0) {
+      /* bzopen: a NULL or empty path means use stdin/stdout */
+      if (path==NULL || strcmp(path,"")==0) {
+        fp = (writing ? stdout : stdin);
+        SET_BINARY_MODE(fp);
+      } else {
+        fp = fopen(path,mode2);
+      }
+   } else {
+#ifdef BZ_STRICT_ANSI
+      fp = NULL;
+#else
+      fp = fdopen(fd,mode2);
+#endif
+   }
+   if (fp == NULL) return NULL;
+
+   if (writing) {
+      /* Guard against total chaos and anarchy -- JRS */
+      if (blockSize100k < 1) blockSize100k = 1;
+      if (blockSize100k > 9) blockSize100k = 9; 
+      bzfp = BZ2_bzWriteOpen(&bzerr,fp,blockSize100k,
+                             verbosity,workFactor);
+   } else {
+      bzfp = BZ2_bzReadOpen(&bzerr,fp,verbosity,smallMode,
+                            unused,nUnused);
+   }
+   if (bzfp == NULL) {
+      /* don't close the standard streams on failure */
+      if (fp != stdin && fp != stdout) fclose(fp);
+      return NULL;
+   }
+   return bzfp;
+}
+
+
+/*---------------------------------------------------*/
+/*--
+   open file for read or write.
+      ex) bzopen("file","w9")
+      case path="" or NULL => use stdin or stdout.
+--*/
+BZFILE * BZ_API(BZ2_bzopen)
+               ( const char *path,
+                 const char *mode )
+{
+   /* Delegate to the common opener in by-path mode; fd is unused. */
+   return bzopen_or_bzdopen ( path, -1, mode, 0 );
+}
+
+
+/*---------------------------------------------------*/
+BZFILE * BZ_API(BZ2_bzdopen)
+               ( int fd,
+                 const char *mode )
+{
+   /* Delegate to the common opener in wrap-an-fd mode; path is unused. */
+   return bzopen_or_bzdopen ( NULL, fd, mode, 1 );
+}
+
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzread) (BZFILE* b, void* buf, int len )
+{
+   int bzerr;
+   int nread;
+
+   /* Once the logical stream has ended, keep reporting EOF (0). */
+   if (((bzFile*)b)->lastErr == BZ_STREAM_END) return 0;
+
+   nread = BZ2_bzRead ( &bzerr, b, buf, len );
+   /* A read that hits end-of-stream still returns the bytes it got. */
+   return (bzerr == BZ_OK || bzerr == BZ_STREAM_END) ? nread : -1;
+}
+
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzwrite) (BZFILE* b, void* buf, int len )
+{
+   int bzerr;
+
+   BZ2_bzWrite ( &bzerr, b, buf, len );
+   /* On success the whole buffer was consumed; otherwise signal -1. */
+   return (bzerr == BZ_OK) ? len : -1;
+}
+
+
+/*---------------------------------------------------*/
+int BZ_API(BZ2_bzflush) (BZFILE *b)
+{
+   /* Flushing is currently a no-op; the handle is intentionally
+      ignored and success is always reported. */
+   return 0;
+}
+
+
+/*---------------------------------------------------*/
+/* Close a BZFILE opened with bzopen/bzdopen.  Flushes and releases
+   the compressor or decompressor state, then closes the underlying
+   FILE* unless it is stdin/stdout.  A NULL handle is a no-op.
+   BUG FIX: the original read ((bzFile*)b)->handle BEFORE the
+   b==NULL guard, so the guard could never help -- a NULL handle
+   dereferenced anyway.  The handle is now fetched after the check. */
+void BZ_API(BZ2_bzclose) (BZFILE* b)
+{
+   int bzerr;
+   FILE *fp;
+   
+   if (b==NULL) {return;}
+   fp = ((bzFile *)b)->handle;
+   if(((bzFile*)b)->writing){
+      BZ2_bzWriteClose(&bzerr,b,0,NULL,NULL);
+      if(bzerr != BZ_OK){
+         /* First close failed; retry in abandon mode to free state. */
+         BZ2_bzWriteClose(NULL,b,1,NULL,NULL);
+      }
+   }else{
+      BZ2_bzReadClose(&bzerr,b);
+   }
+   if(fp!=stdin && fp!=stdout){
+      fclose(fp);
+   }
+}
+
+
+/*---------------------------------------------------*/
+/*--
+   return last error code 
+--*/
+/* Error-name table for BZ2_bzerror, indexed by the NEGATION of a
+   BZ_* status code (BZ_OK == 0 down to BZ_CONFIG_ERROR == -9); the
+   trailing "???" slots absorb any future codes. */
+static char *bzerrorstrings[] = {
+       "OK"
+      ,"SEQUENCE_ERROR"
+      ,"PARAM_ERROR"
+      ,"MEM_ERROR"
+      ,"DATA_ERROR"
+      ,"DATA_ERROR_MAGIC"
+      ,"IO_ERROR"
+      ,"UNEXPECTED_EOF"
+      ,"OUTBUFF_FULL"
+      ,"CONFIG_ERROR"
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+};
+
+
+const char * BZ_API(BZ2_bzerror) (BZFILE *b, int *errnum)
+{
+   int err = ((bzFile *)b)->lastErr;
+
+   /* Positive codes are progress indicators, not errors; collapse
+      them to 0 ("OK").  Store the clamped code for the caller and
+      hand back the matching static name string. */
+   if (err > 0) err = 0;
+   *errnum = err;
+   return bzerrorstrings[-err];
+}
+#endif
+
+
+/*-------------------------------------------------------------*/
+/*--- end                                           bzlib.c ---*/
+/*-------------------------------------------------------------*/
+
+
+/////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////
+
+
+/* A test program written to test robustness to decompression of
+   corrupted data.  Usage is 
+       unzcrash filename
+   and the program will read the specified file, compress it (in memory),
+   and then repeatedly decompress it, each time with a different bit of
+   the compressed data inverted, so as to test all possible one-bit errors.
+   This should not cause any invalid memory accesses.  If it does, 
+   I want to know about it!
+
+   p.s.  As you can see from the above description, the process is
+   incredibly slow.  A file of size eg 5KB will cause it to run for
+   many hours.
+*/
+
+//#include <stdio.h>
+//#include <assert.h>
+//#include "bzlib.h"
+
+/* Working storage for the bit-flipping robustness test below.
+   M_BLOCK bounds the uncompressed input; the output and compressed
+   buffers are oversized to absorb any expansion. */
+#define M_BLOCK 1000000
+
+
+#define M_BLOCK_OUT (M_BLOCK + 1000000)
+ char inbuf[M_BLOCK];
+ char outbuf[M_BLOCK_OUT];
+ char zbuf[M_BLOCK + 600 + (M_BLOCK / 100)];
+
+/* Byte counts currently held in inbuf, outbuf and zbuf respectively. */
+int nIn;
+unsigned int nOut;
+unsigned int nZ;
+
+/* Disabled duplicate of the bzerrorstrings table defined earlier in
+   this file; kept for reference only. */
+#if 0
+static char *bzerrorstrings[] = {
+       "OK"
+      ,"SEQUENCE_ERROR"
+      ,"PARAM_ERROR"
+      ,"MEM_ERROR"
+      ,"DATA_ERROR"
+      ,"DATA_ERROR_MAGIC"
+      ,"IO_ERROR"
+      ,"UNEXPECTED_EOF"
+      ,"OUTBUFF_FULL"
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+      ,"???"   /* for future */
+};
+#endif
+
+/* Invert one bit of the compressed data in zbuf.  'bit' is a
+   zero-based bit index counted from the start of the buffer. */
+void flip_bit ( int bit )
+{
+   int   byte_ix  = bit / 8;
+   int   bit_ix   = bit % 8;
+   UChar bit_mask = 1 << bit_ix;
+   zbuf[byte_ix] ^= bit_mask;
+}
+
+void set_inbuf ( void )
+{
+  inbuf[0] = 0;
+  my_strcat(inbuf, "At her sixtieth birthday party, Margaret Thatcher ");
+  my_strcat(inbuf, "blew on the cake to light the candles.\n");
+  my_strcat(inbuf, "This program, bzip2, the associated library libbzip2, and all\n");
+  my_strcat(inbuf, "documentation, are copyright (C) 1996-2004 Julian R Seward.  All\n");
+  my_strcat(inbuf, "rights reserved.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "Redistribution and use in source and binary forms, with or without\n");
+  my_strcat(inbuf, "modification, are permitted provided that the following conditions\n");
+  my_strcat(inbuf, "are met:\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "1. Redistributions of source code must retain the above copyright\n");
+  my_strcat(inbuf, "   notice, this list of conditions and the following disclaimer.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "2. The origin of this software must not be misrepresented; you must\n");
+  my_strcat(inbuf, "   not claim that you wrote the original software.  If you use this\n");
+  my_strcat(inbuf, "   software in a product, an acknowledgment in the product\n");
+  my_strcat(inbuf, "   documentation would be appreciated but is not required.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "3. Altered source versions must be plainly marked as such, and must\n");
+  my_strcat(inbuf, "   not be misrepresented as being the original software.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "4. The name of the author may not be used to endorse or promote\n");
+  my_strcat(inbuf, "   products derived from this software without specific prior written\n");
+  my_strcat(inbuf, "   permission.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS\n");
+  my_strcat(inbuf, "OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n");
+  my_strcat(inbuf, "WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n");
+  my_strcat(inbuf, "ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY\n");
+  my_strcat(inbuf, "DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n");
+  my_strcat(inbuf, "DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE\n");
+  my_strcat(inbuf, "GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n");
+  my_strcat(inbuf, "INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n");
+  my_strcat(inbuf, "WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n");
+  my_strcat(inbuf, "NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n");
+  my_strcat(inbuf, "SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "ababababababababababababababababababababababababababababababab");
+  my_strcat(inbuf, "		    GNU GENERAL PUBLIC LICENSE\n");
+  my_strcat(inbuf, "		       Version 2, June 1991\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, " Copyright (C) 1989, 1991 Free Software Foundation, Inc.\n");
+  my_strcat(inbuf, "     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA\n");
+  my_strcat(inbuf, " Everyone is permitted to copy and distribute verbatim copies\n");
+  my_strcat(inbuf, " of this license document, but changing it is not allowed.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "			    Preamble\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  The licenses for most software are designed to take away your\n");
+  my_strcat(inbuf, "freedom to share and change it.  By contrast, the GNU General Public\n");
+  my_strcat(inbuf, "License is intended to guarantee your freedom to share and change free\n");
+  my_strcat(inbuf, "software--to make sure the software is free for all its users.  This\n");
+  my_strcat(inbuf, "General Public License applies to most of the Free Software\n");
+  my_strcat(inbuf, "Foundation's software and to any other program whose authors commit to\n");
+  my_strcat(inbuf, "using it.  (Some other Free Software Foundation software is covered by\n");
+  my_strcat(inbuf, "the GNU Library General Public License instead.)  You can apply it to\n");
+  my_strcat(inbuf, "your programs, too.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  When we speak of free software, we are referring to freedom, not\n");
+  my_strcat(inbuf, "price.  Our General Public Licenses are designed to make sure that you\n");
+  my_strcat(inbuf, "have the freedom to distribute copies of free software (and charge for\n");
+  my_strcat(inbuf, "this service if you wish), that you receive source code or can get it\n");
+  my_strcat(inbuf, "if you want it, that you can change the software or use pieces of it\n");
+  my_strcat(inbuf, "in new free programs; and that you know you can do these things.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  To protect your rights, we need to make restrictions that forbid\n");
+  my_strcat(inbuf, "anyone to deny you these rights or to ask you to surrender the rights.\n");
+  my_strcat(inbuf, "These restrictions translate to certain responsibilities for you if you\n");
+  my_strcat(inbuf, "distribute copies of the software, or if you modify it.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  For example, if you distribute copies of such a program, whether\n");
+  my_strcat(inbuf, "gratis or for a fee, you must give the recipients all the rights that\n");
+  my_strcat(inbuf, "you have.  You must make sure that they, too, receive or can get the\n");
+  my_strcat(inbuf, "source code.  And you must show them these terms so they know their\n");
+  my_strcat(inbuf, "rights.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  We protect your rights with two steps: (1) copyright the software, and\n");
+  my_strcat(inbuf, "(2) offer you this license which gives you legal permission to copy,\n");
+  my_strcat(inbuf, "distribute and/or modify the software.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  Also, for each author's protection and ours, we want to make certain\n");
+  my_strcat(inbuf, "that everyone understands that there is no warranty for this free\n");
+  my_strcat(inbuf, "software.  If the software is modified by someone else and passed on, we\n");
+  my_strcat(inbuf, "want its recipients to know that what they have is not the original, so\n");
+  my_strcat(inbuf, "that any problems introduced by others will not reflect on the original\n");
+  my_strcat(inbuf, "authors' reputations.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  Finally, any free program is threatened constantly by software\n");
+  my_strcat(inbuf, "patents.  We wish to avoid the danger that redistributors of a free\n");
+  my_strcat(inbuf, "program will individually obtain patent licenses, in effect making the\n");
+  my_strcat(inbuf, "program proprietary.  To prevent this, we have made it clear that any\n");
+  my_strcat(inbuf, "patent must be licensed for everyone's free use or not licensed at all.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  The precise terms and conditions for copying, distribution and\n");
+  my_strcat(inbuf, "modification follow.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "		    GNU GENERAL PUBLIC LICENSE\n");
+  my_strcat(inbuf, "   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  0. This License applies to any program or other work which contains\n");
+  my_strcat(inbuf, "a notice placed by the copyright holder saying it may be distributed\n");
+  my_strcat(inbuf, "under the terms of this General Public License.  The Program, below,\n");
+  my_strcat(inbuf, "refers to any such program or work, and a work based on the Program\n");
+  my_strcat(inbuf, "means either the Program or any derivative work under copyright law:\n");
+  my_strcat(inbuf, "that is to say, a work containing the Program or a portion of it,\n");
+  my_strcat(inbuf, "either verbatim or with modifications and/or translated into another\n");
+  my_strcat(inbuf, "language.  (Hereinafter, translation is included without limitation in\n");
+  my_strcat(inbuf, "the term modification.)  Each licensee is addressed as you.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "Activities other than copying, distribution and modification are not\n");
+  my_strcat(inbuf, "covered by this License; they are outside its scope.  The act of\n");
+  my_strcat(inbuf, "running the Program is not restricted, and the output from the Program\n");
+  my_strcat(inbuf, "is covered only if its contents constitute a work based on the\n");
+  my_strcat(inbuf, "Program (independent of having been made by running the Program).\n");
+  my_strcat(inbuf, "Whether that is true depends on what the Program does.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  1. You may copy and distribute verbatim copies of the Program's\n");
+  my_strcat(inbuf, "source code as you receive it, in any medium, provided that you\n");
+  my_strcat(inbuf, "conspicuously and appropriately publish on each copy an appropriate\n");
+  my_strcat(inbuf, "copyright notice and disclaimer of warranty; keep intact all the\n");
+  my_strcat(inbuf, "notices that refer to this License and to the absence of any warranty;\n");
+  my_strcat(inbuf, "and give any other recipients of the Program a copy of this License\n");
+  my_strcat(inbuf, "along with the Program.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "You may charge a fee for the physical act of transferring a copy, and\n");
+  my_strcat(inbuf, "you may at your option offer warranty protection in exchange for a fee.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  2. You may modify your copy or copies of the Program or any portion\n");
+  my_strcat(inbuf, "of it, thus forming a work based on the Program, and copy and\n");
+  my_strcat(inbuf, "distribute such modifications or work under the terms of Section 1\n");
+  my_strcat(inbuf, "above, provided that you also meet all of these conditions:\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    a) You must cause the modified files to carry prominent notices\n");
+  my_strcat(inbuf, "    stating that you changed the files and the date of any change.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    b) You must cause any work that you distribute or publish, that in\n");
+  my_strcat(inbuf, "    whole or in part contains or is derived from the Program or any\n");
+  my_strcat(inbuf, "    part thereof, to be licensed as a whole at no charge to all third\n");
+  my_strcat(inbuf, "    parties under the terms of this License.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    c) If the modified program normally reads commands interactively\n");
+  my_strcat(inbuf, "    when run, you must cause it, when started running for such\n");
+  my_strcat(inbuf, "    interactive use in the most ordinary way, to print or display an\n");
+  my_strcat(inbuf, "    announcement including an appropriate copyright notice and a\n");
+  my_strcat(inbuf, "    notice that there is no warranty (or else, saying that you provide\n");
+  my_strcat(inbuf, "    a warranty) and that users may redistribute the program under\n");
+  my_strcat(inbuf, "    these conditions, and telling the user how to view a copy of this\n");
+  my_strcat(inbuf, "    License.  (Exception: if the Program itself is interactive but\n");
+  my_strcat(inbuf, "    does not normally print such an announcement, your work based on\n");
+  my_strcat(inbuf, "    the Program is not required to print an announcement.)\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "These requirements apply to the modified work as a whole.  If\n");
+  my_strcat(inbuf, "identifiable sections of that work are not derived from the Program,\n");
+  my_strcat(inbuf, "and can be reasonably considered independent and separate works in\n");
+  my_strcat(inbuf, "themselves, then this License, and its terms, do not apply to those\n");
+  my_strcat(inbuf, "sections when you distribute them as separate works.  But when you\n");
+  my_strcat(inbuf, "distribute the same sections as part of a whole which is a work based\n");
+  my_strcat(inbuf, "on the Program, the distribution of the whole must be on the terms of\n");
+  my_strcat(inbuf, "this License, whose permissions for other licensees extend to the\n");
+  my_strcat(inbuf, "entire whole, and thus to each and every part regardless of who wrote it.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "Thus, it is not the intent of this section to claim rights or contest\n");
+  my_strcat(inbuf, "your rights to work written entirely by you; rather, the intent is to\n");
+  my_strcat(inbuf, "exercise the right to control the distribution of derivative or\n");
+  my_strcat(inbuf, "collective works based on the Program.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "In addition, mere aggregation of another work not based on the Program\n");
+  my_strcat(inbuf, "with the Program (or with a work based on the Program) on a volume of\n");
+  my_strcat(inbuf, "a storage or distribution medium does not bring the other work under\n");
+  my_strcat(inbuf, "the scope of this License.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  3. You may copy and distribute the Program (or a work based on it,\n");
+  my_strcat(inbuf, "under Section 2) in object code or executable form under the terms of\n");
+  my_strcat(inbuf, "Sections 1 and 2 above provided that you also do one of the following:\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    a) Accompany it with the complete corresponding machine-readable\n");
+  my_strcat(inbuf, "    source code, which must be distributed under the terms of Sections\n");
+  my_strcat(inbuf, "    1 and 2 above on a medium customarily used for software interchange; or,\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    b) Accompany it with a written offer, valid for at least three\n");
+  my_strcat(inbuf, "    years, to give any third party, for a charge no more than your\n");
+  my_strcat(inbuf, "    cost of physically performing source distribution, a complete\n");
+  my_strcat(inbuf, "    machine-readable copy of the corresponding source code, to be\n");
+  my_strcat(inbuf, "    distributed under the terms of Sections 1 and 2 above on a medium\n");
+  my_strcat(inbuf, "    customarily used for software interchange; or,\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    c) Accompany it with the information you received as to the offer\n");
+  my_strcat(inbuf, "    to distribute corresponding source code.  (This alternative is\n");
+  my_strcat(inbuf, "    allowed only for noncommercial distribution and only if you\n");
+  my_strcat(inbuf, "    received the program in object code or executable form with such\n");
+  my_strcat(inbuf, "    an offer, in accord with Subsection b above.)\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "The source code for a work means the preferred form of the work for\n");
+  my_strcat(inbuf, "making modifications to it.  For an executable work, complete source\n");
+  my_strcat(inbuf, "code means all the source code for all modules it contains, plus any\n");
+  my_strcat(inbuf, "associated interface definition files, plus the scripts used to\n");
+  my_strcat(inbuf, "control compilation and installation of the executable.  However, as a\n");
+  my_strcat(inbuf, "special exception, the source code distributed need not include\n");
+  my_strcat(inbuf, "anything that is normally distributed (in either source or binary\n");
+  my_strcat(inbuf, "form) with the major components (compiler, kernel, and so on) of the\n");
+  my_strcat(inbuf, "operating system on which the executable runs, unless that component\n");
+  my_strcat(inbuf, "itself accompanies the executable.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "If distribution of executable or object code is made by offering\n");
+  my_strcat(inbuf, "access to copy from a designated place, then offering equivalent\n");
+  my_strcat(inbuf, "access to copy the source code from the same place counts as\n");
+  my_strcat(inbuf, "distribution of the source code, even though third parties are not\n");
+  my_strcat(inbuf, "compelled to copy the source along with the object code.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  4. You may not copy, modify, sublicense, or distribute the Program\n");
+  my_strcat(inbuf, "except as expressly provided under this License.  Any attempt\n");
+  my_strcat(inbuf, "otherwise to copy, modify, sublicense or distribute the Program is\n");
+  my_strcat(inbuf, "void, and will automatically terminate your rights under this License.\n");
+  my_strcat(inbuf, "However, parties who have received copies, or rights, from you under\n");
+  my_strcat(inbuf, "this License will not have their licenses terminated so long as such\n");
+  my_strcat(inbuf, "parties remain in full compliance.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  5. You are not required to accept this License, since you have not\n");
+  my_strcat(inbuf, "signed it.  However, nothing else grants you permission to modify or\n");
+  my_strcat(inbuf, "distribute the Program or its derivative works.  These actions are\n");
+  my_strcat(inbuf, "prohibited by law if you do not accept this License.  Therefore, by\n");
+  my_strcat(inbuf, "modifying or distributing the Program (or any work based on the\n");
+  my_strcat(inbuf, "Program), you indicate your acceptance of this License to do so, and\n");
+  my_strcat(inbuf, "all its terms and conditions for copying, distributing or modifying\n");
+  my_strcat(inbuf, "the Program or works based on it.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  6. Each time you redistribute the Program (or any work based on the\n");
+  my_strcat(inbuf, "Program), the recipient automatically receives a license from the\n");
+  my_strcat(inbuf, "original licensor to copy, distribute or modify the Program subject to\n");
+  my_strcat(inbuf, "these terms and conditions.  You may not impose any further\n");
+  my_strcat(inbuf, "restrictions on the recipients' exercise of the rights granted herein.\n");
+  my_strcat(inbuf, "You are not responsible for enforcing compliance by third parties to\n");
+  my_strcat(inbuf, "this License.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  7. If, as a consequence of a court judgment or allegation of patent\n");
+  my_strcat(inbuf, "infringement or for any other reason (not limited to patent issues),\n");
+  my_strcat(inbuf, "conditions are imposed on you (whether by court order, agreement or\n");
+  my_strcat(inbuf, "otherwise) that contradict the conditions of this License, they do not\n");
+  my_strcat(inbuf, "excuse you from the conditions of this License.  If you cannot\n");
+  my_strcat(inbuf, "distribute so as to satisfy simultaneously your obligations under this\n");
+  my_strcat(inbuf, "License and any other pertinent obligations, then as a consequence you\n");
+  my_strcat(inbuf, "may not distribute the Program at all.  For example, if a patent\n");
+  my_strcat(inbuf, "license would not permit royalty-free redistribution of the Program by\n");
+  my_strcat(inbuf, "all those who receive copies directly or indirectly through you, then\n");
+  my_strcat(inbuf, "the only way you could satisfy both it and this License would be to\n");
+  my_strcat(inbuf, "refrain entirely from distribution of the Program.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "If any portion of this section is held invalid or unenforceable under\n");
+  my_strcat(inbuf, "any particular circumstance, the balance of the section is intended to\n");
+  my_strcat(inbuf, "apply and the section as a whole is intended to apply in other\n");
+  my_strcat(inbuf, "circumstances.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "It is not the purpose of this section to induce you to infringe any\n");
+  my_strcat(inbuf, "patents or other property right claims or to contest validity of any\n");
+  my_strcat(inbuf, "such claims; this section has the sole purpose of protecting the\n");
+  my_strcat(inbuf, "integrity of the free software distribution system, which is\n");
+  my_strcat(inbuf, "implemented by public license practices.  Many people have made\n");
+  my_strcat(inbuf, "generous contributions to the wide range of software distributed\n");
+  my_strcat(inbuf, "through that system in reliance on consistent application of that\n");
+  my_strcat(inbuf, "system; it is up to the author/donor to decide if he or she is willing\n");
+  my_strcat(inbuf, "to distribute software through any other system and a licensee cannot\n");
+  my_strcat(inbuf, "impose that choice.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "This section is intended to make thoroughly clear what is believed to\n");
+  my_strcat(inbuf, "be a consequence of the rest of this License.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  8. If the distribution and/or use of the Program is restricted in\n");
+  my_strcat(inbuf, "certain countries either by patents or by copyrighted interfaces, the\n");
+  my_strcat(inbuf, "original copyright holder who places the Program under this License\n");
+  my_strcat(inbuf, "may add an explicit geographical distribution limitation excluding\n");
+  my_strcat(inbuf, "those countries, so that distribution is permitted only in or among\n");
+  my_strcat(inbuf, "countries not thus excluded.  In such case, this License incorporates\n");
+  my_strcat(inbuf, "the limitation as if written in the body of this License.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  9. The Free Software Foundation may publish revised and/or new versions\n");
+  my_strcat(inbuf, "of the General Public License from time to time.  Such new versions will\n");
+  my_strcat(inbuf, "be similar in spirit to the present version, but may differ in detail to\n");
+  my_strcat(inbuf, "address new problems or concerns.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "Each version is given a distinguishing version number.  If the Program\n");
+  my_strcat(inbuf, "specifies a version number of this License which applies to it and any\n");
+  my_strcat(inbuf, "later version, you have the option of following the terms and conditions\n");
+  my_strcat(inbuf, "either of that version or of any later version published by the Free\n");
+  my_strcat(inbuf, "Software Foundation.  If the Program does not specify a version number of\n");
+  my_strcat(inbuf, "this License, you may choose any version ever published by the Free Software\n");
+  my_strcat(inbuf, "Foundation.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  10. If you wish to incorporate parts of the Program into other free\n");
+  my_strcat(inbuf, "programs whose distribution conditions are different, write to the author\n");
+  my_strcat(inbuf, "to ask for permission.  For software which is copyrighted by the Free\n");
+  my_strcat(inbuf, "Software Foundation, write to the Free Software Foundation; we sometimes\n");
+  my_strcat(inbuf, "make exceptions for this.  Our decision will be guided by the two goals\n");
+  my_strcat(inbuf, "of preserving the free status of all derivatives of our free software and\n");
+  my_strcat(inbuf, "of promoting the sharing and reuse of software generally.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "			    NO WARRANTY\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY\n");
+  my_strcat(inbuf, "FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN\n");
+  my_strcat(inbuf, "OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES\n");
+  my_strcat(inbuf, "PROVIDE THE PROGRAM AS IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED\n");
+  my_strcat(inbuf, "OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\n");
+  my_strcat(inbuf, "MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS\n");
+  my_strcat(inbuf, "TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE\n");
+  my_strcat(inbuf, "PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,\n");
+  my_strcat(inbuf, "REPAIR OR CORRECTION.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\n");
+  my_strcat(inbuf, "WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR\n");
+  my_strcat(inbuf, "REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,\n");
+  my_strcat(inbuf, "INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING\n");
+  my_strcat(inbuf, "OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED\n");
+  my_strcat(inbuf, "TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY\n");
+  my_strcat(inbuf, "YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER\n");
+  my_strcat(inbuf, "PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE\n");
+  my_strcat(inbuf, "POSSIBILITY OF SUCH DAMAGES.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "		     END OF TERMS AND CONDITIONS\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "	    How to Apply These Terms to Your New Programs\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  If you develop a new program, and you want it to be of the greatest\n");
+  my_strcat(inbuf, "possible use to the public, the best way to achieve this is to make it\n");
+  my_strcat(inbuf, "free software which everyone can redistribute and change under these terms.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  To do so, attach the following notices to the program.  It is safest\n");
+  my_strcat(inbuf, "to attach them to the start of each source file to most effectively\n");
+  my_strcat(inbuf, "convey the exclusion of warranty; and each file should have at least\n");
+  my_strcat(inbuf, "the copyright line and a pointer to where the full notice is found.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    <one line to give the program's name and a brief idea of what it does.>\n");
+  my_strcat(inbuf, "    Copyright (C) <year>  <name of author>\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    This program is free software; you can redistribute it and/or modify\n");
+  my_strcat(inbuf, "    it under the terms of the GNU General Public License as published by\n");
+  my_strcat(inbuf, "    the Free Software Foundation; either version 2 of the License, or\n");
+  my_strcat(inbuf, "    (at your option) any later version.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    This program is distributed in the hope that it will be useful,\n");
+  my_strcat(inbuf, "    but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
+  my_strcat(inbuf, "    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n");
+  my_strcat(inbuf, "    GNU General Public License for more details.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    You should have received a copy of the GNU General Public License\n");
+  my_strcat(inbuf, "    along with this program; if not, write to the Free Software\n");
+  my_strcat(inbuf, "    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "Also add information on how to contact you by electronic and paper mail.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "If the program is interactive, make it output a short notice like this\n");
+  my_strcat(inbuf, "when it starts in an interactive mode:\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "    Gnomovision version 69, Copyright (C) year  name of author\n");
+  my_strcat(inbuf, "    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.\n");
+  my_strcat(inbuf, "    This is free software, and you are welcome to redistribute it\n");
+  my_strcat(inbuf, "    under certain conditions; type `show c' for details.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "The hypothetical commands `show w' and `show c' should show the appropriate\n");
+  my_strcat(inbuf, "parts of the General Public License.  Of course, the commands you use may\n");
+  my_strcat(inbuf, "be called something other than `show w' and `show c'; they could even be\n");
+  my_strcat(inbuf, "mouse-clicks or menu items--whatever suits your program.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "You should also get your employer (if you work as a programmer) or your\n");
+  my_strcat(inbuf, "school, if any, to sign a copyright disclaimer for the program, if\n");
+  my_strcat(inbuf, "necessary.  Here is a sample; alter the names:\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  Yoyodyne, Inc., hereby disclaims all copyright interest in the program\n");
+  my_strcat(inbuf, "  `Gnomovision' (which makes passes at compilers) written by James Hacker.\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "  <signature of Ty Coon>, 1 April 1989\n");
+  my_strcat(inbuf, "  Ty Coon, President of Vice\n");
+  my_strcat(inbuf, "\n");
+  my_strcat(inbuf, "This General Public License does not permit incorporating your program into\n");
+  my_strcat(inbuf, "proprietary programs.  If your program is a subroutine library, you may\n");
+  my_strcat(inbuf, "consider it more useful to permit linking proprietary applications with the\n");
+  my_strcat(inbuf, "library.  If this is what you want to do, use the GNU Library General\n");
+  my_strcat(inbuf, "Public License instead of this License.\n");
+
+  my_strcat(inbuf, "\n");
+}
+
+#include <stdio.h>
+#include <malloc.h>
+#include <assert.h>
+
+/* Service dispatcher: arg1 selects the service, arg2 is its operand. */
+static HWord g_serviceFn ( HWord arg1, HWord arg2 )
+{
+   switch (arg1) {
+      case 0: /* EXIT */
+         exit(0);   /* never returns */
+      case 1: /* PUTC */
+         putchar(arg2);   /* arg2 holds the character value */
+         return 0;
+      case 2: /* MALLOC */
+         return (HWord)malloc(arg2);   /* arg2 is the byte count */
+      case 3: /* FREE */
+         free((void*)arg2);   /* arg2 is the pointer, cast to HWord */
+         return 0;
+      default:
+         assert(0);   /* unknown service; NOTE(review): if built with NDEBUG this falls off the end of a non-void function */
+   }
+}
+
+static char *bzerrorstrings[] = {   /* printable names for BZ_* status codes; indexed by NEGATED code (see bzerrorstrings[-r] in main) */
+       "OK"
+       ,"SEQUENCE_ERROR"
+       ,"PARAM_ERROR"
+       ,"MEM_ERROR"
+       ,"DATA_ERROR"
+       ,"DATA_ERROR_MAGIC"
+       ,"IO_ERROR"
+       ,"UNEXPECTED_EOF"
+       ,"OUTBUFF_FULL"
+       ,"CONFIG_ERROR"
+       ,"???"   /* for future */
+       ,"???"   /* for future */
+       ,"???"   /* for future */
+       ,"???"   /* for future */
+       ,"???"   /* for future */
+       ,"???"   /* for future */
+};
+
+#include "../memcheck.h"
+
+// If given a cmd line arg, behave as a correctness regtest
+// (run fast and be verbose).  If not, run for a long time
+// which is what is needed for the performance suite.
+int main ( int argc, char** argv )
+{
+   int   r;
+   int   bit;
+   int   i;
+   int regtest;   /* nonzero => fast, verbose regression-test mode */
+
+   assert(argc == 1 || argc == 2);
+   regtest = argc==2;   /* presence of any argument selects regtest mode */
+   serviceFn = g_serviceFn;   /* route EXIT/PUTC/MALLOC/FREE requests to our dispatcher */
+
+   set_inbuf();   /* fill inbuf with the fixed license text built above */
+   nIn = vex_strlen(inbuf)+1;   /* +1: include the terminating NUL in the input */
+   vex_printf( "%d bytes read\n", nIn );
+
+   /* Make inbuf[10] be undefined, so as to check that this source
+      eventually shows up in various places. */
+   VALGRIND_MAKE_MEM_UNDEFINED(&inbuf[10], sizeof(char));
+
+   if (inbuf[10] == 11) vex_printf("foo\n"); else vex_printf("bar\n");   /* deliberate branch on the undefined byte */
+
+   nZ = M_BLOCK;
+   r = BZ2_bzBuffToBuffCompress (
+          zbuf, &nZ, inbuf, nIn, 9, 3/*verb*/, 30 );
+
+   if (r != BZ_OK) {
+     vex_printf("initial compress failed!\n");
+     (*serviceFn)(0,0);   /* service 0 == exit(0) */
+   }
+   vex_printf( "%d after compression\n", nZ );
+
+   for (bit = 0; bit < nZ*8; bit += (bit < 35 ? 1 : (regtest?2377:137))) {   /* every bit early on, then a sparse sample */
+      if (regtest)
+         vex_printf( "bit %d  ", bit );
+      flip_bit ( bit );   /* corrupt one bit of the compressed stream */
+      nOut = M_BLOCK_OUT;
+      r = BZ2_bzBuffToBuffDecompress (
+             outbuf, &nOut, zbuf, nZ, 1/*small*/, 0 );
+      if (regtest)
+         vex_printf( " %d  %s ", r, bzerrorstrings[-r] );   /* BZ_* error codes are <= 0; negate to index the table */
+
+      if (r != BZ_OK) {
+	 if (regtest)
+            vex_printf( "\n" );
+      } else {
+         if (nOut != nIn) {
+           vex_printf(  "nIn/nOut mismatch %d %d\n", nIn, nOut );
+           (*serviceFn)(0,0);   /* exit(0) on round-trip size mismatch */
+         } else {
+           for (i = 0; i < nOut; i++)
+             if (inbuf[i] != outbuf[i]) { 
+                vex_printf(  "mismatch at %d\n", i ); 
+                (*serviceFn)(0,0); 
+           }
+           if (i == nOut) vex_printf( "really ok!\n" );   /* exact byte-for-byte round-trip */
+         }
+      }
+
+      flip_bit ( bit );   /* restore the flipped bit for the next iteration */
+   }
+
+#if 0
+   assert (nOut == nIn);
+   for (i = 0; i < nOut; i++) {
+     if (inbuf[i] != outbuf[i]) {
+        vex_printf( "difference at %d !\n", i );
+        return 1;
+     }
+   }
+#endif
+
+   vex_printf( "all ok\n" );
+   (*serviceFn)(0,0);   /* service 0 == exit(0); does not return */
+   /*NOTREACHED*/
+   return 0;
+}
diff --git a/memcheck/tests/origin5-bz2.stderr.exp-glibc25-amd64 b/memcheck/tests/origin5-bz2.stderr.exp-glibc25-amd64
new file mode 100644
index 0000000..b4bd290
--- /dev/null
+++ b/memcheck/tests/origin5-bz2.stderr.exp-glibc25-amd64
@@ -0,0 +1,115 @@
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin5-bz2.c:6481)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2820)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2823)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2854)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2858)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2963)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2964)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: fallbackSort (origin5-bz2.c:2269)
+   by 0x........: BZ2_blockSort (origin5-bz2.c:3116)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 8
+   at 0x........: fallbackSort (origin5-bz2.c:2275)
+   by 0x........: BZ2_blockSort (origin5-bz2.c:3116)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin5-bz2.c:6512)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
diff --git a/memcheck/tests/origin5-bz2.stderr.exp-glibc25-x86 b/memcheck/tests/origin5-bz2.stderr.exp-glibc25-x86
new file mode 100644
index 0000000..6f75ee7
--- /dev/null
+++ b/memcheck/tests/origin5-bz2.stderr.exp-glibc25-x86
@@ -0,0 +1,115 @@
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin5-bz2.c:6481)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2820)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2823)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2855)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2859)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2963)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2964)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: fallbackSort (origin5-bz2.c:2269)
+   by 0x........: BZ2_blockSort (origin5-bz2.c:3116)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Use of uninitialised value of size 4
+   at 0x........: fallbackSort (origin5-bz2.c:2275)
+   by 0x........: BZ2_blockSort (origin5-bz2.c:3116)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin5-bz2.c:6512)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6479)
diff --git a/memcheck/tests/origin5-bz2.stderr.exp-glibc27-ppc64 b/memcheck/tests/origin5-bz2.stderr.exp-glibc27-ppc64
new file mode 100644
index 0000000..09af0f7
--- /dev/null
+++ b/memcheck/tests/origin5-bz2.stderr.exp-glibc27-ppc64
@@ -0,0 +1,115 @@
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin5-bz2.c:6481)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: handle_compress (origin5-bz2.c:4686)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2820)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2823)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2854)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2858)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2963)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: BZ2_blockSort (origin5-bz2.c:2964)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: fallbackSort (origin5-bz2.c:2269)
+   by 0x........: BZ2_blockSort (origin5-bz2.c:3116)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Use of uninitialised value of size 8
+   at 0x........: fallbackSort (origin5-bz2.c:2275)
+   by 0x........: BZ2_blockSort (origin5-bz2.c:3116)
+   by 0x........: BZ2_compressBlock (origin5-bz2.c:4034)
+   by 0x........: handle_compress (origin5-bz2.c:4753)
+   by 0x........: BZ2_bzCompress (origin5-bz2.c:4822)
+   by 0x........: BZ2_bzBuffToBuffCompress (origin5-bz2.c:5630)
+   by 0x........: main (origin5-bz2.c:6484)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
+
+Conditional jump or move depends on uninitialised value(s)
+   at 0x........: main (origin5-bz2.c:6512)
+ Uninitialised value was created by a client request
+   at 0x........: main (origin5-bz2.c:6481)
diff --git a/memcheck/tests/origin5-bz2.stdout.exp b/memcheck/tests/origin5-bz2.stdout.exp
new file mode 100644
index 0000000..17364ba
--- /dev/null
+++ b/memcheck/tests/origin5-bz2.stdout.exp
@@ -0,0 +1,71 @@
+22323 bytes read
+bar
+    block 1: crc = 0xA212ABF8, combined CRC = 0xA212ABF8, size = 22373
+    too repetitive; using fallback sorting algorithm
+      22373 in block, 13504 after MTF & 1-2 coding, 79+2 syms in use
+      pass 1: size is 17143, grp uses are 38 62 2 92 6 71 
+      pass 2: size is 6506, grp uses are 28 71 0 86 9 77 
+      pass 3: size is 6479, grp uses are 26 70 0 81 11 83 
+      pass 4: size is 6469, grp uses are 26 69 0 74 17 85 
+      bytes: mapping 19, selectors 66, code lengths 134, codes 6465
+    final combined CRC = 0xA212ABF8
+   6710 after compression
+bit 0   -5  DATA_ERROR_MAGIC 
+bit 1   -5  DATA_ERROR_MAGIC 
+bit 2   -5  DATA_ERROR_MAGIC 
+bit 3   -5  DATA_ERROR_MAGIC 
+bit 4   -5  DATA_ERROR_MAGIC 
+bit 5   -5  DATA_ERROR_MAGIC 
+bit 6   -5  DATA_ERROR_MAGIC 
+bit 7   -5  DATA_ERROR_MAGIC 
+bit 8   -5  DATA_ERROR_MAGIC 
+bit 9   -5  DATA_ERROR_MAGIC 
+bit 10   -5  DATA_ERROR_MAGIC 
+bit 11   -5  DATA_ERROR_MAGIC 
+bit 12   -5  DATA_ERROR_MAGIC 
+bit 13   -5  DATA_ERROR_MAGIC 
+bit 14   -5  DATA_ERROR_MAGIC 
+bit 15   -5  DATA_ERROR_MAGIC 
+bit 16   -5  DATA_ERROR_MAGIC 
+bit 17   -5  DATA_ERROR_MAGIC 
+bit 18   -5  DATA_ERROR_MAGIC 
+bit 19   -5  DATA_ERROR_MAGIC 
+bit 20   -5  DATA_ERROR_MAGIC 
+bit 21   -5  DATA_ERROR_MAGIC 
+bit 22   -5  DATA_ERROR_MAGIC 
+bit 23   -5  DATA_ERROR_MAGIC 
+bit 24   0  OK really ok!
+bit 25   -5  DATA_ERROR_MAGIC 
+bit 26   -5  DATA_ERROR_MAGIC 
+bit 27   0  OK really ok!
+bit 28   -5  DATA_ERROR_MAGIC 
+bit 29   -5  DATA_ERROR_MAGIC 
+bit 30   -5  DATA_ERROR_MAGIC 
+bit 31   -5  DATA_ERROR_MAGIC 
+bit 32   -4  DATA_ERROR 
+bit 33   -4  DATA_ERROR 
+bit 34   -4  DATA_ERROR 
+bit 35   -4  DATA_ERROR 
+bit 2412   -4  DATA_ERROR 
+bit 4789   -4  DATA_ERROR 
+bit 7166   -4  DATA_ERROR 
+bit 9543   -4  DATA_ERROR 
+bit 11920   -4  DATA_ERROR 
+bit 14297   -4  DATA_ERROR 
+bit 16674   -4  DATA_ERROR 
+bit 19051   -4  DATA_ERROR 
+bit 21428   -4  DATA_ERROR 
+bit 23805   -4  DATA_ERROR 
+bit 26182   -4  DATA_ERROR 
+bit 28559   -4  DATA_ERROR 
+bit 30936   -4  DATA_ERROR 
+bit 33313   -4  DATA_ERROR 
+bit 35690   -4  DATA_ERROR 
+bit 38067   -4  DATA_ERROR 
+bit 40444   -4  DATA_ERROR 
+bit 42821   -4  DATA_ERROR 
+bit 45198   -4  DATA_ERROR 
+bit 47575   -4  DATA_ERROR 
+bit 49952   -4  DATA_ERROR 
+bit 52329   -4  DATA_ERROR 
+all ok
diff --git a/memcheck/tests/origin5-bz2.vgtest b/memcheck/tests/origin5-bz2.vgtest
new file mode 100644
index 0000000..5f9f573
--- /dev/null
+++ b/memcheck/tests/origin5-bz2.vgtest
@@ -0,0 +1,3 @@
+prog: origin5-bz2
+vgopts: -q --track-origins=yes
+args: x