Add exp-bbv to the tool-suite.  I'm seeing a couple of amd64-linux test
failures, but they can be fixed up in-repo.  This resolves bug 198395.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@10444 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/Makefile.am b/Makefile.am
index 5c8f043..9e99f99 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -12,7 +12,8 @@
 		helgrind \
 		drd
 
-EXP_TOOLS = 	exp-ptrcheck
+EXP_TOOLS = 	exp-ptrcheck \
+		exp-bbv
 
 # DDD: once all tools work on Darwin, TEST_TOOLS and TEST_EXP_TOOLS can be
 # replaced with TOOLS and EXP_TOOLS.
@@ -27,7 +28,7 @@
 		lackey \
 		none
 
-  TEST_EXP_TOOLS =
+  TEST_EXP_TOOLS = exp-bbv
 endif
 
 # Put docs last because building the HTML is slow and we want to get
diff --git a/NEWS b/NEWS
index 21db12c..993a17d 100644
--- a/NEWS
+++ b/NEWS
@@ -29,6 +29,8 @@
 
 * XXX: something about improved Wine support?
 
+* XXX: exp-bbv has been added...
+
 * A new Memcheck client request VALGRIND_COUNT_LEAK_BLOCKS has been added.
   It is similar to VALGRIND_COUNT_LEAKS but counts blocks instead of bytes.
   [XXX: consider adding VALGRIND_COUNT_LEAK_BYTES as a synonym and
diff --git a/configure.in b/configure.in
index a3d2201..c4f3306 100644
--- a/configure.in
+++ b/configure.in
@@ -1900,6 +1900,13 @@
    drd/docs/Makefile
    drd/scripts/download-and-build-splash2
    drd/tests/Makefile
+   exp-bbv/Makefile
+   exp-bbv/docs/Makefile
+   exp-bbv/tests/Makefile
+   exp-bbv/tests/x86/Makefile
+   exp-bbv/tests/x86-linux/Makefile
+   exp-bbv/tests/amd64-linux/Makefile
+   exp-bbv/tests/ppc32-linux/Makefile
 ])
 AC_OUTPUT
 
diff --git a/docs/xml/manual.xml b/docs/xml/manual.xml
index 727570a..53cee09 100644
--- a/docs/xml/manual.xml
+++ b/docs/xml/manual.xml
@@ -38,6 +38,8 @@
       xmlns:xi="http://www.w3.org/2001/XInclude" />
   <xi:include href="../../exp-ptrcheck/docs/pc-manual.xml" parse="xml"  
       xmlns:xi="http://www.w3.org/2001/XInclude" />
+  <xi:include href="../../exp-bbv/docs/bbv-manual.xml" parse="xml"  
+      xmlns:xi="http://www.w3.org/2001/XInclude" />      
   <xi:include href="../../none/docs/nl-manual.xml" parse="xml"  
       xmlns:xi="http://www.w3.org/2001/XInclude" />
   <xi:include href="../../lackey/docs/lk-manual.xml" parse="xml"  
diff --git a/docs/xml/valgrind-manpage.xml b/docs/xml/valgrind-manpage.xml
index e45d72a..7163eec 100644
--- a/docs/xml/valgrind-manpage.xml
+++ b/docs/xml/valgrind-manpage.xml
@@ -259,6 +259,14 @@
 
 </refsect1>
 
+<refsect1 id="bbv-options">
+<title>BBV Options</title>
+
+<xi:include href="../../exp-bbv/docs/bbv-manual.xml" 
+            xpointer="bbv.opts.list"
+            xmlns:xi="http://www.w3.org/2001/XInclude" />
+
+</refsect1>
 
 
 <refsect1 id="lackey-options">
@@ -271,7 +279,6 @@
 </refsect1>
 
 
-
 <refsect1 id="see_also">
 <title>See Also</title>
 
diff --git a/exp-bbv/Makefile.am b/exp-bbv/Makefile.am
new file mode 100644
index 0000000..367d0d4
--- /dev/null
+++ b/exp-bbv/Makefile.am
@@ -0,0 +1,37 @@
+include $(top_srcdir)/Makefile.tool.am
+
+#----------------------------------------------------------------------------
+# exp-bbv-<platform>
+#----------------------------------------------------------------------------
+
+noinst_PROGRAMS  = exp-bbv-@VGCONF_ARCH_PRI@-@VGCONF_OS@
+if VGCONF_HAVE_PLATFORM_SEC
+noinst_PROGRAMS += exp-bbv-@VGCONF_ARCH_SEC@-@VGCONF_OS@
+endif
+
+BBV_SOURCES_COMMON = bbv_main.c
+
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES      = $(BBV_SOURCES_COMMON)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CPPFLAGS     = \
+	$(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS       = \
+	$(AM_CFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_DEPENDENCIES = \
+	$(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDADD        = \
+	$(TOOL_LDADD_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS      = \
+	$(TOOL_LDFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
+if VGCONF_HAVE_PLATFORM_SEC
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_SOURCES      = $(BBV_SOURCES_COMMON)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CPPFLAGS     = \
+	$(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS       = \
+	$(AM_CFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_DEPENDENCIES = \
+	$(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDADD        = \
+	$(TOOL_LDADD_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS      = \
+	$(TOOL_LDFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
+endif
diff --git a/exp-bbv/bbv_main.c b/exp-bbv/bbv_main.c
new file mode 100644
index 0000000..b5db191
--- /dev/null
+++ b/exp-bbv/bbv_main.c
@@ -0,0 +1,633 @@
+//--------------------------------------------------------------------*/
+//--- BBV: a SimPoint basic block vector generator      bbv_main.c ---*/
+//--------------------------------------------------------------------*/
+
+/*
+   This file is part of BBV, a Valgrind tool for generating SimPoint
+   basic block vectors.
+
+   Copyright (C) 2006-2009 Vince Weaver
+      vince _at_ csl.cornell.edu
+
+   pcfile code is Copyright (C) 2006-2009 Oriol Prat
+      oriol.prat _at_ bsc.es
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+
+#include "pub_tool_basics.h"
+#include "pub_tool_tooliface.h"
+#include "pub_tool_options.h"    /* command line options */
+
+#include "pub_tool_vki.h"        /* vki_stat */
+#include "pub_tool_libcbase.h"   /* VG_(strlen) */
+#include "pub_tool_libcfile.h"   /* VG_(write) */
+#include "pub_tool_libcprint.h"  /* VG_(printf) */
+#include "pub_tool_libcassert.h" /* VG_(exit) */
+#include "pub_tool_mallocfree.h" /* plain_free */
+#include "pub_tool_machine.h"    /* VG_(fnptr_to_fnentry) */
+#include "pub_tool_debuginfo.h"  /* VG_(get_fnname) */
+
+#include "pub_tool_oset.h"       /* ordered set stuff */
+
+   /* instruction special cases: bit flags returned by get_inst_type() */
+#define REP_INSTRUCTION   0x1
+#define FLDCW_INSTRUCTION 0x2
+
+   /* interval variables */
+#define DEFAULT_GRAIN_SIZE 100000000  /* 100 million by default */
+static Int interval_size=DEFAULT_GRAIN_SIZE;   /* set by --interval-size */
+
+   /* filenames: clo_* hold the option values, the other two hold */
+   /* the names VG_(expand_file_name) produces from them          */
+static UChar *clo_bb_out_file="bb.out.%p";
+static UChar *clo_pc_out_file="pc.out.%p";
+static UChar *pc_out_file=NULL;
+static UChar *bb_out_file=NULL;
+
+   /* output parameters */
+static Bool instr_count_only=False;   /* set by --instr-count-only */
+static Bool generate_pc_file=False;   /* set when --pc-out-file is given */
+
+   /* write buffer: scratch space for formatting text before it is written */
+static UChar buf[1024];
+
+   /* Global values */
+static OSet* instr_info_table;  /* table that holds the basic block info */
+static Int block_num=1;         /* global next block number */
+static Int current_thread=0;    /* index into bbv_thread of the running thread */
+static Int allocated_threads=1; /* number of entries in bbv_thread */
+struct thread_info *bbv_thread=NULL;   /* per-thread state, indexed by thread id */
+
+   /* Per-thread variables */
+struct thread_info {
+   ULong dyn_instr;         /* Instructions counted in the current interval */
+   ULong total_instr;       /* Total retired instruction count   */
+   Addr last_rep_addr;      /* rep counting values: address of last rep seen */
+   ULong rep_count;         /* helper calls in the rep run not yet closed out */
+   ULong global_rep_count;  /* sum of rep_count over all closed-out runs */
+   ULong unique_rep_count;  /* number of rep runs closed out */
+   ULong fldcw_count;       /* fldcw count */
+   Int bbtrace_fd;          /* file descriptor; negative until opened */
+};
+
+#define FUNCTION_NAME_LENGTH 20   /* size in bytes of BB_info.fn_name */
+
+struct BB_info {
+   Addr       BB_addr;           /* used as key, must be first           */
+   Int        n_instrs;          /* instructions in the basic block      */
+   Int        block_num;         /* unique block identifier              */
+   Int        *inst_counter;     /* per-thread counts for this interval  */
+   Bool       is_entry;          /* is this block a function entry point */
+   UChar      fn_name[FUNCTION_NAME_LENGTH];  /* Function block is in    */
+};
+
+
+   /* Dump the optional PC file, which maps each basic block number */
+   /*   to its instruction address (in hex) and function name       */
+static void dumpPcFile(void)
+{
+   struct BB_info   *bb_elem;
+   Int              pctrace_fd;
+   SysRes           sres;
+
+   pc_out_file =
+          VG_(expand_file_name)("--pc-out-file", clo_pc_out_file);
+
+   sres = VG_(open)(pc_out_file, VKI_O_CREAT|VKI_O_TRUNC|VKI_O_WRONLY,
+                              VKI_S_IRUSR|VKI_S_IWUSR|VKI_S_IRGRP|VKI_S_IWGRP);
+   if (sr_isError(sres)) {
+      VG_UMSG("Error: cannot create pc file %s\n", pc_out_file);
+      VG_(exit)(1);
+   }
+   pctrace_fd = sr_Res(sres);
+
+      /* Loop through the table, printing the number, address,   */
+      /*    and function name for each basic block.  The address */
+      /*    is printed as a full word: (Int) would truncate it   */
+   VG_(OSetGen_ResetIter)(instr_info_table);
+   while ( (bb_elem = VG_(OSetGen_Next)(instr_info_table)) ) {
+      VG_(write)(pctrace_fd,"F",1);
+      VG_(sprintf)( buf,":%d:%lx:%s\n",
+                       bb_elem->block_num,
+                       (UWord)bb_elem->BB_addr,
+                       bb_elem->fn_name);
+      VG_(write)(pctrace_fd, (void*)buf, VG_(strlen)(buf));
+   }
+
+   VG_(close)(pctrace_fd);
+}
+
+static Int open_tracefile(Int thread_num)
+{
+   SysRes  sres;
+   UChar temp_string[2048];
+
+      /* Create (truncating) the bb trace file for thread_num and */
+      /* return its fd.  Thread 1 uses bb_out_file as is, so the  */
+      /* single-thread case has no suffix; others get ".<num>".   */
+   if (thread_num==1) {
+      VG_(strncpy)(temp_string,bb_out_file,sizeof(temp_string)-1);
+      temp_string[sizeof(temp_string)-1]='\0';  /* strncpy may not terminate */
+   }
+   else {
+      VG_(snprintf)(temp_string,sizeof(temp_string),"%s.%d",bb_out_file,thread_num);
+   }
+
+   sres = VG_(open)(temp_string, VKI_O_CREAT|VKI_O_TRUNC|VKI_O_WRONLY,
+                              VKI_S_IRUSR|VKI_S_IWUSR|VKI_S_IRGRP|VKI_S_IWGRP);
+   if (sr_isError(sres)) {
+      VG_UMSG("Error: cannot create bb file %s\n",temp_string);
+      VG_(exit)(1);
+   }
+
+   return sr_Res(sres);
+}
+
+static void handle_overflow(void)
+{
+   struct thread_info *self = &bbv_thread[current_thread];
+   struct BB_info     *node;
+      /* Once the thread has run more than interval_size instructions, */
+      /* write out its block counts and start the next interval        */
+   if (self->dyn_instr <= interval_size) {
+      return;
+   }
+
+   if (!instr_count_only) {
+
+         /* Open this thread's output file the first time it is needed */
+      if (self->bbtrace_fd < 0) {
+         self->bbtrace_fd = open_tracefile(current_thread);
+      }
+
+         /* Emit one bb.out line: "T", then a ":block:count" pair for */
+         /* every block this thread ran in the interval, then newline */
+      VG_(write)(self->bbtrace_fd,"T",1);
+
+      VG_(OSetGen_ResetIter)(instr_info_table);
+      while ( (node = VG_(OSetGen_Next)(instr_info_table)) != NULL ) {
+         Int *count = &node->inst_counter[current_thread];
+
+         if (*count == 0) continue;
+         VG_(sprintf)( buf,":%d:%d   ", node->block_num, *count);
+         VG_(write)(self->bbtrace_fd, (void*)buf, VG_(strlen)(buf));
+         *count = 0;
+      }
+      VG_(write)(self->bbtrace_fd,"\n",1);
+   }
+
+   self->dyn_instr -= interval_size;
+}
+
+
+static void close_out_reps(void)
+{
+   bbv_thread[current_thread].unique_rep_count += 1;   /* one more rep run */
+   bbv_thread[current_thread].global_rep_count += bbv_thread[current_thread].rep_count;
+   bbv_thread[current_thread].rep_count = 0;           /* run is accounted for */
+}
+
+   /* Helper run for each instruction that is neither rep nor fldcw */
+static VG_REGPARM(1) void per_instruction_BBV(struct BB_info *bbInfo)
+{
+   struct thread_info *self = &bbv_thread[current_thread];
+   Int retired;
+
+   tl_assert(bbInfo);
+
+      /* A rep run that was never closed out counts as one more instruction */
+   retired = self->rep_count ? 2 : 1;
+   if (retired == 2) {
+      close_out_reps();
+   }
+
+   bbInfo->inst_counter[current_thread] += retired;
+   self->total_instr += retired;
+   self->dyn_instr   += retired;
+
+   handle_overflow();
+}
+
+   /* Helper run for each instruction flagged as rep-prefixed, at addr */
+static VG_REGPARM(1) void per_instruction_BBV_rep(Addr addr)
+{
+   struct thread_info *self = &bbv_thread[current_thread];
+
+      /* new rep address: count any previous run as one instruction */
+   if (self->last_rep_addr != addr) {
+      if (self->rep_count != 0) {
+         close_out_reps();
+         self->total_instr += 1;
+         self->dyn_instr   += 1;
+      }
+      self->last_rep_addr = addr;
+   }
+   self->rep_count += 1;
+}
+
+   /* Helper run for each fldcw instruction: counted like any other */
+   /*   instruction, and additionally tallied in fldcw_count        */
+static VG_REGPARM(1) void per_instruction_BBV_fldcw(struct BB_info *bbInfo)
+{
+   struct thread_info *self = &bbv_thread[current_thread];
+   Int retired = 1;
+
+   tl_assert(bbInfo);
+
+      /* A rep run that was never closed out counts as one more */
+   if (self->rep_count != 0) {
+      retired += 1;
+      close_out_reps();
+   }
+
+      /* count fldcw instructions */
+   self->fldcw_count += 1;
+
+   bbInfo->inst_counter[current_thread] += retired;
+   self->total_instr += retired;
+   self->dyn_instr   += retired;
+   handle_overflow();
+}
+
+   /* Check if the len-byte instruction at addr needs special handling. */
+   /*   Returns REP_INSTRUCTION and/or FLDCW_INSTRUCTION bits; always   */
+   /*   0 when not built for x86 or amd64.                              */
+static Int get_inst_type(Int len, Addr addr)
+{
+   int result=0;
+
+#if defined(VGA_x86) || defined(VGA_amd64)
+
+   unsigned char *inst_pointer;
+   unsigned char inst_byte;
+   int i,possible_rep;
+
+   /* rep prefixed instructions are counted as one instruction on */
+   /*     x86 processors and must be handled as a special case    */
+
+   /* Also, the rep prefix is re-used as part of the opcode for   */
+   /*     SSE instructions.  So we need to specifically check for */
+   /*     the following: movs, cmps, scas, lods, stos, ins, outs  */
+
+   inst_pointer=(unsigned char *)addr;
+   i=0;
+   inst_byte=0;
+   possible_rep=0;
+      /* Scan prefix bytes; stop at the first byte that is not one */
+   while (i<len) {
+
+      inst_byte=*inst_pointer;
+
+      if ( (inst_byte == 0x67) ||            /* address-size override prefix */
+           (inst_byte == 0x66) ||            /* operand-size override prefix */
+           (inst_byte == 0x48) ) {           /* REX.W prefix (amd64) */
+      } else if ( (inst_byte == 0xf2) ||     /* repne prefix  */
+                  (inst_byte == 0xf3) ) {    /* rep prefix    */
+         possible_rep=1;
+      } else {
+         break;                              /* other byte, exit */
+      }
+
+      i++;
+      inst_pointer++;
+   }
+
+   if ( possible_rep &&
+        ( ( (inst_byte >= 0xa4) &&     /* movs,cmps,scas */
+            (inst_byte <= 0xaf) ) ||   /* lods,stos      */
+          ( (inst_byte >= 0x6c) &&
+            (inst_byte <= 0x6f) ) ) ) {  /* ins,outs       */
+
+      result|=REP_INSTRUCTION;
+   }
+
+   /* fldcw instructions are double-counted by the hardware       */
+   /*     performance counters on pentium 4 processors so it is   */
+   /*     useful to have that count when doing validation work.   */
+
+   inst_pointer=(unsigned char *)addr;
+   if (len>1) {
+         /* FLDCW detection */
+         /* opcode is 0xd9/5, ie 1101 1001 oo10 1mmm */
+      if ((*inst_pointer==0xd9) &&
+          (*(inst_pointer+1)<0xb0) &&  /* rejects 0xe8-0xef (fld1..fldz) */
+          ( (*(inst_pointer+1) & 0x38) == 0x28)) {
+         result|=FLDCW_INSTRUCTION;
+      }
+   }
+
+#endif
+   return result;
+}
+
+
+
+   /* Our instrumentation function: returns a copy of sbIn with a */
+   /*    counting helper called before each instruction           */
+   /*    sbIn = super block to translate                          */
+   /*    layout = guest layout                                    */
+   /*    gWordTy = size of guest word                             */
+   /*    hWordTy = size of host word                              */
+static IRSB* bbv_instrument ( VgCallbackClosure* closure,
+                             IRSB* sbIn, VexGuestLayout* layout,
+                             VexGuestExtents* vge,
+                             IRType gWordTy, IRType hWordTy )
+{
+   Int      i,n_instrs=1;
+   IRSB     *sbOut;
+   IRStmt   *st;
+   struct BB_info  *bbInfo;
+   Addr     origAddr;   /* table key: must be exactly Addr-sized */
+   Addr64   ourAddr;
+   IRDirty  *di;
+   IRExpr   **argv, *arg1;
+   Int      regparms,opcode_type;
+
+      /* We don't handle a host/guest word size mismatch */
+   if (gWordTy != hWordTy) {
+      VG_(tool_panic)("host/guest word size mismatch");
+   }
+
+      /* Set up SB */
+   sbOut = deepCopyIRSBExceptStmts(sbIn);
+
+      /* Copy verbatim any IR preamble preceding the first IMark */
+   i = 0;
+   while ( (i < sbIn->stmts_used) && (sbIn->stmts[i]->tag!=Ist_IMark)) {
+      addStmtToIRSB( sbOut, sbIn->stmts[i] );
+      i++;
+   }
+
+      /* Get the first instruction (the loop must have stopped on one) */
+   tl_assert(i < sbIn->stmts_used);
+   st = sbIn->stmts[i];
+
+      /* double check we are at a Mark statement */
+   tl_assert(Ist_IMark == st->tag);
+
+   origAddr=(Addr)st->Ist.IMark.addr;
+
+      /* Get the BB_info */
+   bbInfo = VG_(OSetGen_Lookup)(instr_info_table, &origAddr);
+
+   if (bbInfo==NULL) {
+
+         /* BB never translated before (at this address, at least;          */
+         /* could have been unloaded and then reloaded elsewhere in memory) */
+
+         /* allocate and initialize a new basic block structure */
+      bbInfo=VG_(OSetGen_AllocNode)(instr_info_table, sizeof(struct BB_info));
+      bbInfo->BB_addr = origAddr;
+      bbInfo->n_instrs = n_instrs;
+      bbInfo->inst_counter=VG_(calloc)("bbv_instrument",
+                                       allocated_threads,
+                                       sizeof(Int));
+
+         /* assign a unique block number */
+      bbInfo->block_num=block_num;
+      block_num++;
+         /* get function name and entry point information */
+      bbInfo->fn_name[0]='\0';   /* stays empty if the lookups find no name */
+      VG_(get_fnname)(origAddr,bbInfo->fn_name,FUNCTION_NAME_LENGTH);
+      bbInfo->is_entry=VG_(get_fnname_if_entry)(origAddr, bbInfo->fn_name,
+                                                FUNCTION_NAME_LENGTH);
+         /* insert structure into table */
+      VG_(OSetGen_Insert)( instr_info_table, bbInfo );
+   }
+
+      /* Iterate through the basic block, putting the original   */
+      /* instructions in place, plus putting a call to updateBBV */
+      /* for each original instruction                           */
+
+      /* This is less efficient than only instrumenting the BB   */
+      /* But it gives proper results given the fact that         */
+      /* valgrind uses superblocks (not basic blocks) by default */
+
+   while(i < sbIn->stmts_used) {
+      st=sbIn->stmts[i];
+
+      if (st->tag == Ist_IMark) {
+
+         ourAddr = st->Ist.IMark.addr;
+
+         opcode_type=get_inst_type(st->Ist.IMark.len,ourAddr);
+
+         regparms=1;
+         arg1= mkIRExpr_HWord( (HWord)bbInfo);
+         argv= mkIRExprVec_1(arg1);
+
+         if (opcode_type&REP_INSTRUCTION) {
+            arg1= mkIRExpr_HWord(ourAddr);
+            argv= mkIRExprVec_1(arg1);
+            di= unsafeIRDirty_0_N( regparms, "per_instruction_BBV_rep",
+                                VG_(fnptr_to_fnentry)( &per_instruction_BBV_rep ),
+                                argv);
+         }
+         else if (opcode_type&FLDCW_INSTRUCTION) {
+            di= unsafeIRDirty_0_N( regparms, "per_instruction_BBV_fldcw",
+                                VG_(fnptr_to_fnentry)( &per_instruction_BBV_fldcw ),
+                                argv);
+         }
+         else {
+            di= unsafeIRDirty_0_N( regparms, "per_instruction_BBV",
+                                VG_(fnptr_to_fnentry)( &per_instruction_BBV ),
+                                argv);
+         }
+
+            /* Insert our call */
+         addStmtToIRSB( sbOut,  IRStmt_Dirty(di));
+      }
+
+         /* Insert the original instruction */
+      addStmtToIRSB( sbOut, st );
+
+      i++;
+   }
+
+   return sbOut;
+}
+
+   /* Grow the thread table from old_number to new_number entries (and  */
+   /*   each block's inst_counter to match); returns the resized table  */
+static struct thread_info *allocate_new_thread(struct thread_info *old,
+                                     Int old_number, Int new_number)
+{
+   struct thread_info *table;
+   struct BB_info     *node;
+   Int t;
+
+   table = VG_(realloc)("bbv_main.c allocate_threads", old,
+                        new_number*sizeof(struct thread_info));
+
+      /* Initialise every added slot; thread ids need not be contiguous */
+   for (t = old_number; t < new_number; t++) {
+      struct thread_info *slot = &table[t];
+      slot->dyn_instr        = 0;
+      slot->total_instr      = 0;
+      slot->last_rep_addr    = 0;
+      slot->rep_count        = 0;
+      slot->global_rep_count = 0;
+      slot->unique_rep_count = 0;
+      slot->fldcw_count      = 0;
+      slot->bbtrace_fd       = -1;      /* no trace file opened yet */
+   }
+      /* expand the inst_counter on all allocated basic blocks */
+   VG_(OSetGen_ResetIter)(instr_info_table);
+   while ( (node = VG_(OSetGen_Next)(instr_info_table)) != NULL ) {
+      node->inst_counter = VG_(realloc)("bbv_main.c inst_counter",
+                                        node->inst_counter,
+                                        new_number*sizeof(Int));
+      for (t = old_number; t < new_number; t++) {
+         node->inst_counter[t] = 0;
+      }
+   }
+
+   return table;
+}
+
+static void bbv_thread_called ( ThreadId tid, ULong nDisp )
+{
+   current_thread = tid;      /* counters now go to this thread's slot */
+   if (tid < allocated_threads) return;
+      /* first sighting of this thread id: grow the tables to hold it */
+   bbv_thread = allocate_new_thread(bbv_thread, allocated_threads, tid+1);
+   allocated_threads = tid+1;
+}
+
+
+
+
+/*--------------------------------------------------------------------*/
+/*--- Setup                                                        ---*/
+/*--------------------------------------------------------------------*/
+
+static void bbv_post_clo_init(void)   /* runs once options are parsed */
+{
+   if (interval_size <= 0) {   /* 0 would divide by zero in bbv_fini */
+      VG_UMSG("Error: --interval-size must be positive, not %d\n", interval_size);
+      VG_(exit)(1);
+   }
+   bb_out_file = VG_(expand_file_name)("--bb-out-file", clo_bb_out_file);
+      /* Approximate basic blocks; same as --vex-guest-chase-thresh=0 */
+   VG_(clo_vex_control).guest_chase_thresh = 0;
+}
+
+   /* Parse one command line option; returns False if arg is not ours */
+static Bool bbv_process_cmd_line_option(Char* arg)
+{
+   if VG_INT_CLO  (arg, "--interval-size",    interval_size)   { return True; }
+   if VG_STR_CLO  (arg, "--bb-out-file",      clo_bb_out_file) { return True; }
+   if VG_STR_CLO  (arg, "--pc-out-file",      clo_pc_out_file) {
+      generate_pc_file = True;    /* naming the file also requests it */
+      return True;
+   }
+   if VG_XACT_CLO (arg, "--instr-count-only", instr_count_only, True) {
+      return True;
+   }
+
+   return False;
+}
+
+static void bbv_print_usage(void)
+{
+   VG_(printf) ("   --bb-out-file=<file>  filename for basic block vector info\n"
+                "   --pc-out-file=<file>  filename for basic block addresses and function names\n"
+                "   --interval-size=<num> interval size\n"
+                "   --instr-count-only    only print total instruction count\n");
+}
+
+static void bbv_print_debug_usage(void)
+{
+   VG_(printf)("    (none)\n");   /* the tool has no debugging options */
+}
+
+   /* Registered as the tool's fini function: writes the final reports */
+static void bbv_fini(Int exitcode)
+{
+   Int t;
+
+   if (generate_pc_file) {
+      dumpPcFile();
+   }
+
+   for (t = 0; t < allocated_threads; t++) {
+      struct thread_info *ti = &bbv_thread[t];
+
+      if (ti->total_instr == 0) continue;   /* nothing counted: no report */
+
+      VG_(sprintf)(buf,"\n\n"
+                       "# Thread %d\n"
+                       "#   Total intervals: %d (Interval Size %d)\n"
+                       "#   Total instructions: %lld\n"
+                       "#   Total reps: %lld\n"
+                       "#   Unique reps: %lld\n"
+                       "#   Total fldcw instructions: %lld\n\n",
+             t,
+             (Int)(ti->total_instr/(ULong)interval_size),
+             interval_size,
+             ti->total_instr,
+             ti->global_rep_count,
+             ti->unique_rep_count,
+             ti->fldcw_count);
+
+         /* Print results to display */
+      VG_UMSG("%s", buf);
+
+         /* Also print to results file, opening it first if needed */
+      if (ti->bbtrace_fd < 0) {
+         ti->bbtrace_fd = open_tracefile(t);
+      }
+      VG_(write)(ti->bbtrace_fd,(void*)buf,VG_(strlen)(buf));
+      VG_(close)(ti->bbtrace_fd);
+   }
+}
+
+static void bbv_pre_clo_init(void)   /* named in VG_DETERMINE_INTERFACE_VERSION */
+{
+   VG_(details_name)            ("exp-bbv");
+   VG_(details_version)         (NULL);
+   VG_(details_description)     ("a SimPoint basic block vector generator");
+   VG_(details_copyright_author)(
+      "Copyright (C) 2006-2009 Vince Weaver");
+   VG_(details_bug_reports_to)  (VG_BUGS_TO);
+
+   VG_(basic_tool_funcs)          (bbv_post_clo_init,
+                                   bbv_instrument,
+                                   bbv_fini);
+
+   VG_(needs_command_line_options)(bbv_process_cmd_line_option,
+                                   bbv_print_usage,
+                                   bbv_print_debug_usage);
+      /* bbv_thread_called keeps current_thread (and the tables) up to date */
+   VG_(track_start_client_code)( bbv_thread_called );
+
+      /* block table keyed on BB_info.BB_addr: key offset 0, no compare fn */
+   instr_info_table = VG_(OSetGen_Create)(/*keyOff*/0,
+                                          NULL,
+                                          VG_(malloc), "bbv.1", VG_(free));
+      /* create the initial thread table with allocated_threads entries */
+   bbv_thread=allocate_new_thread(bbv_thread,0,allocated_threads);
+}
+
+VG_DETERMINE_INTERFACE_VERSION(bbv_pre_clo_init)
+
+/*--------------------------------------------------------------------*/
+/*--- end                                                          ---*/
+/*--------------------------------------------------------------------*/
diff --git a/exp-bbv/docs/Makefile.am b/exp-bbv/docs/Makefile.am
new file mode 100644
index 0000000..734dc54
--- /dev/null
+++ b/exp-bbv/docs/Makefile.am
@@ -0,0 +1,2 @@
+EXTRA_DIST = bbv-manual.xml
+
diff --git a/exp-bbv/docs/bbv-manual.xml b/exp-bbv/docs/bbv-manual.xml
new file mode 100644
index 0000000..a699a5f
--- /dev/null
+++ b/exp-bbv/docs/bbv-manual.xml
@@ -0,0 +1,345 @@
+<?xml version="1.0"?> <!-- -*- sgml -*- -->
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
+  "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+
+<chapter id="bbv-manual" xreflabel="BBV">
+  <title>BBV: a Basic Block Vector generation tool</title>
+
+<para>To use this tool, you must specify
+<computeroutput>--tool=exp-bbv</computeroutput> on the Valgrind
+command line.</para>
+
+<sect1 id="bbv-manual.background" xreflabel="BBV Background">
+<title>Basic Block Profiling and SimPoint</title>
+
+<para>
+   A Basic Block Vector (BBV) is a list of all basic blocks entered
+   during program execution, and a count of how many times each
+   block was run (a basic block is a section of code
+   with only one entry point and one exit point).
+</para>
+
+<para>
+   This tool was written to generate basic block vectors
+   for use with the SimPoint analysis tool 
+   (http://www.cse.ucsd.edu/~calder/simpoint/).
+   The SimPoint methodology enables speeding up architectural 
+   simulations by only running a small portion of a program
+   and then extrapolating total behavior from this
+   small portion.  Most programs exhibit phase-based behavior, which
+   means that at various times during execution a program will encounter 
+   intervals of time where the code behaves similarly to a previous
+   interval.  If you can detect these intervals and group them together, 
+   an approximation of the total program behavior can be obtained
+   by only simulating a bare minimum number of intervals, and then scaling 
+   the results.
+</para>
+
+<para>
+  In computer architecture research, running a 
+  benchmark on a cycle-accurate simulator can cause slowdowns on the order
+  of 1000 times, making it take days, weeks, or even longer to run full
+  benchmarks.  By utilizing SimPoint this can be reduced significantly
+  while still retaining reasonable accuracy, usually in the 5-10% range.
+</para>
+
+<para>
+   A more complete introduction to how SimPoint works can be 
+   found in the paper "Automatically Characterizing Large Scale 
+   Program Behavior" by T. Sherwood, E. Perelman, G. Hamerly, and 
+   B. Calder.  
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.quickstart" xreflabel="Quick Start">
+<title>Using Basic Block Vectors to create SimPoints</title>
+
+<para>
+   To quickly create a basic block vector file, you will call Valgrind
+   like this:
+   <computeroutput>valgrind --tool=exp-bbv /bin/ls</computeroutput>
+   In this case we are running on the "ls" program, but this
+   can be any executable.  By default a file called 
+   <computeroutput>bb.out.PID</computeroutput> will be created,
+   where PID is replaced by the process ID of the running process.
+   This file is the basic block vector.  For long-running programs
+   this file can be quite large, so it might be wise to compress
+   it with gzip or some other compression program.
+</para>   
+
+<para>
+   To create actual SimPoint results, you will need the
+   SimPoint utility, available from the SimPoint webpage
+   (http://www.cse.ucsd.edu/~calder/simpoint/).
+   Assuming you have downloaded SimPoint 3.2 and compiled it,
+   create SimPoint results with a command like the following:
+      
+   <computeroutput>./SimPoint.3.2/bin/simpoint -inputVectorsGzipped \
+           -loadFVFile bb.out.1234.gz \
+	   -k 5 -saveSimpoints results.simpts \
+	   -saveSimpointWeights results.weights
+   </computeroutput>
+   where bb.out.1234.gz is your compressed basic block vector file
+   generated by Valgrind exp-bbv.
+</para>
+
+<para>   
+   The SimPoint utility does random linear projection using 15 dimensions,
+   then does k-means clustering to calculate which intervals are 
+   of interest.  In this example we specify 5 intervals with the 
+   -k 5 option.   
+</para>   
+   
+<para>   
+   The outputs from the SimPoint run are the 
+   <computeroutput>results.simpts</computeroutput>
+   and <computeroutput>results.weights</computeroutput> files.
+   The first holds the 5 most relevant intervals of the program.
+   The second holds the weight to scale each interval by when
+   extrapolating full-program behavior.  The intervals and the weights
+   can be used in conjunction with a simulator that supports
+   fast-forwarding; you fast-forward to the interval of interest,
+   collect stats for the desired interval length, then use
+   statistics gathered in conjunction with the weights to 
+   calculate your results.
+</para> 
+   
+</sect1>
+
+<sect1 id="bbv-manual.usage" xreflabel="BBV Usage">
+<title>BBV Command Line Options</title>
+
+<para>
+   BBV has various options that control the behavior of the plugin:
+<!-- start of xi:include in the manpage -->
+<variablelist id="bbv.opts.list">
+
+  <varlistentry id="opt.interval-size" xreflabel="--interval-size">
+      <term>
+        <option><![CDATA[--interval-size=<number> [default: 100000000] ]]></option>
+      </term>
+      <listitem>
+      <para>
+         This option selects the size of the interval to use.  
+         The default is 100 
+         million instructions, which is a commonly used value.  
+         Other sizes can be used; smaller intervals can help programs
+         with finer-grained phases.  However smaller interval size
+         can lead to accuracy issues due to warm-up effects 
+         (When fast-forwarding the various architectural features
+         will be un-initialized, and it will take some number
+         of instructions before they "warm up" to the state a 
+         full simulation would be at without the fast-forwarding.
+         Large interval sizes tend to mitigate this.)
+      </para>
+      </listitem>
+  </varlistentry>
+
+  <varlistentry id="opt.instr-count-only" xreflabel="--instr-count-only">
+     <term>
+        <option><![CDATA[--instr-count-only [default: no] ]]></option>
+     </term>
+     <listitem>
+        <para>
+           This option tells the tool to only display instruction 
+           count totals, and to not generate the
+           actual BBV file.  This is useful for debugging, and for
+           gathering instruction count info without generating
+           the large BBV files.
+        </para>
+     </listitem>
+   </varlistentry>
+  
+  <varlistentry id="opt.bb-out-file" xreflabel="--bb-out-file">
+     <term>
+        <option><![CDATA[--bb-out-file=<name> [default: bb.out.%p] ]]></option>
+     </term>
+     <listitem>
+        <para>
+           This option selects the name of the basic block file.  Default is 
+           bb.out.%p.   The
+           <option>%p</option> and <option>%q</option> format specifiers can be
+           used to embed the process ID and/or the contents of an environment
+           variable in the name, as is the case for the core option
+           <option>--log-file</option>.
+        </para>
+     </listitem>
+  </varlistentry>
+
+  <varlistentry id="opt.pc-out-file" xreflabel="--pc-out-file">
+     <term>
+        <option><![CDATA[--pc-out-file=<name> [default: pc.out.%p] ]]></option>
+     </term>
+     <listitem>
+        <para>
+           This option selects the name of the PC file.  
+           This file holds program counter addresses
+           and function name info for the various basic blocks.
+           This can be used in conjunction
+           with the bbv file to fast-forward via function names
+           instead of just instruction counts.
+	   The default filename is pc.out.%p.
+           The <option>%p</option> and <option>%q</option> format specifiers can be
+           used to embed the process ID and/or the contents of an environment
+           variable in the name, as is the case for the core option
+           <option>--log-file</option>.
+
+        </para>
+     </listitem>
+   </varlistentry>
+</variablelist>
+<!-- end of xi:include in the manpage -->
+
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.fileformat" xreflabel="BBV File Format">
+<title>Basic Block Vector File Format</title>
+
+<para>  
+  The Basic Block Vector is dumped at fixed intervals.  This
+  is commonly done every 100 million instructions; the 
+  <computeroutput>--interval-size</computeroutput> option can be 
+  used to change this.
+</para>
+
+<para>
+  The output file looks like this:
+</para>
+
+<programlisting><![CDATA[
+T:45:1024 :189:99343
+T:11:78573 :15:1353  :56:1
+T:18:45 :12:135353 :56:78 :314:4324263]]></programlisting>
+
+<para>
+  Each new interval starts with a T.   This is followed by a colon,
+  then by a unique number identifying the basic block.  This is followed
+  by another colon, then followed by the frequency (which is scaled
+  by the number of instructions in the basic block).
+</para>
+
+<para>
+  The entry count is multiplied by the number of instructions that are 
+  in the basic block, in order to weigh the count so that instructions in 
+  small Basic Blocks aren't counted as more important than instructions 
+  in large Basic Blocks.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.implementation" xreflabel="Implementation">
+<title>Implementation</title>
+
+<para>
+   Valgrind provides all of the information necessary to create
+   BBV files.  In the current implementation, all instructions
+   are instrumented.  This is slower (by approximately a factor
+   of two) than a method that instruments at the basic-block level, 
+   but there are some complications (especially with rep prefix
+   detection) that make that method more difficult.
+</para>
+  
+<para>
+   Valgrind actually provides instrumentation at a super-block level.
+   A super-block has one entry point but unlike basic-blocks can
+   have multiple exit points.  Once a branch occurs into the middle
+   of a block, it is split into a new basic-block.  Because
+   Valgrind cannot produce "true" basic blocks, the generated
+   BBV vectors will be different than those generated by other tools.
+   In practice this does not seem to affect the accuracy of the
+   SimPoint results.  We do internally force the
+   <computeroutput>--vex-guest-chase-thresh=0</computeroutput>
+   option to Valgrind which forces a more basic-block like
+   behavior.
+</para>
+
+<para>
+   When a super block is run for the first time, it is instrumented
+   with our BBV routine.  This adds a call to our instruction
+   counting function for each original instruction.
+   The current superblock is looked up in an Ordered Set to find 
+   a structure that holds block-specific statistics (the entry point 
+   address is the key used for the lookup).  We increment the 
+   instruction count for this superblock and
+   also update the master instruction count.
+   If the master count overflows the interval size 
+   then we print out the basic block statistics for the current interval
+   to disk, and then reset all the superblock counters to zero.
+</para>
+
+<para>
+   On the x86 and amd64 architectures the code takes special
+   care with rep-prefixed string instructions.  This is because 
+   actual hardware counts a rep-prefixed instruction 
+   as one instruction, while a naive Valgrind implementation
+   would count it as many (possibly hundreds, thousands or even millions)
+   of instructions.  We have special code to handle
+   this properly, which makes the results match hardware performance
+   counter results.
+</para>   
+   
+<para>
+   The exp-bbv tool also counts the fldcw instruction.  This
+   instruction is used on x86 machines when converting numbers
+   from floating point to integer (among other uses).
+   On Pentium 4 systems the retired instruction performance
+   counter counts this instruction as two
+   instructions (all other known processors only count it as one).
+   This can affect results when using SimPoint on Pentium 4 systems,
+   so we provide the count for use in mitigating this at analysis time.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.threadsupport" xreflabel="BBV Threaded Support">
+<title>Threaded Executable Support</title>
+
+<para>
+   BBV supports threaded programs.  When a program has multiple threads,
+   an additional BBV file is created for each thread (each additional
+   file is the specified filename with the thread number
+   appended at the end).
+</para>
+
+<para>
+   There is no official method of using SimPoint with
+   threaded workloads.  The most common method is to run
+   SimPoint on each thread's results independently, and use 
+   some method of deterministic execution to try to match the
+   original workload.  This should be possible with current
+   exp-bbv.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.validation" xreflabel="BBV Validation">
+<title>Validation</title>
+
+<para>
+   This plugin has been tested on x86, amd64, and ppc32 platforms.
+   An earlier version of the plugin was tested in detail using
+   hardware performance counters; this work is described in a paper 
+   from the HiPEAC'08 conference, "Using Dynamic Binary Instrumentation 
+   to Generate Multi-Platform SimPoints: Methodology and Accuracy" by
+   V.M. Weaver and S.A. McKee.
+</para>
+ 
+</sect1>
+ 
+<sect1 id="bbv-manual.performance" xreflabel="BBV Performance">
+<title>Performance</title>
+
+<para>
+  Using this program slows down execution by roughly a factor of 40
+  over native execution.  This varies depending on the machine
+  used and the benchmark being run.
+  On the SPEC CPU 2000 benchmarks running on a 3.4GHz Pentium D 
+  processor, the slowdown ranges from 24x (mcf) to 340x (vortex.2).
+</para>
+
+</sect1>
+
+</chapter>
diff --git a/exp-bbv/tests/Makefile.am b/exp-bbv/tests/Makefile.am
new file mode 100644
index 0000000..efd5914
--- /dev/null
+++ b/exp-bbv/tests/Makefile.am
@@ -0,0 +1,29 @@
+
+include $(top_srcdir)/Makefile.tool-tests.am
+
+SUBDIRS = .
+
+# Platform-specific tests
+if VGCONF_ARCHS_INCLUDE_X86
+SUBDIRS += x86
+endif
+if VGCONF_PLATFORMS_INCLUDE_X86_LINUX
+SUBDIRS += x86-linux
+endif
+if VGCONF_PLATFORMS_INCLUDE_AMD64_LINUX
+SUBDIRS += amd64-linux
+endif
+if VGCONF_PLATFORMS_INCLUDE_PPC32_LINUX
+SUBDIRS += ppc32-linux
+endif
+
+DIST_SUBDIRS = x86 x86-linux amd64-linux ppc32-linux .
+
+# Shared filters; the per-platform tests run them as ../filter_bb and ../filter_stderr
+dist_noinst_SCRIPTS = filter_bb filter_stderr
+EXTRA_DIST = logo.include logo.lzss_new
+check_PROGRAMS = 
+
+AM_CFLAGS   += $(AM_FLAG_M3264_PRI)
+AM_CXXFLAGS += $(AM_FLAG_M3264_PRI)
+
diff --git a/exp-bbv/tests/amd64-linux/Makefile.am b/exp-bbv/tests/amd64-linux/Makefile.am
new file mode 100644
index 0000000..65ef300
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/Makefile.am
@@ -0,0 +1,36 @@
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+check_PROGRAMS = \
+	million rep_prefix ll fldcw_check complex_rep clone_test
+
+EXTRA_DIST = \
+	   clone_test.stderr.exp \
+	   clone_test.post.exp \
+	   clone_test.vgtest \
+	   complex_rep.stderr.exp \
+	   complex_rep.vgtest \
+	   fldcw_check.stderr.exp \
+	   fldcw_check.vgtest \
+	   ll.stderr.exp \
+	   ll.stdout.exp \
+	   ll.post.exp \
+	   ll.vgtest \
+	   million.stderr.exp \
+	   million.post.exp \
+	   million.vgtest \
+	   rep_prefix.stderr.exp \
+	   rep_prefix.vgtest 
+
+AM_CCASFLAGS += -ffreestanding
+# These tests define _start themselves (see e.g. clone_test.S) and use raw syscalls.
+LDFLAGS += -nostartfiles -nodefaultlibs
+# One assembly source per program; without _SOURCES automake would look for <name>.c
+clone_test_SOURCES = clone_test.S
+complex_rep_SOURCES = complex_rep.S
+fldcw_check_SOURCES = fldcw_check.S
+ll_SOURCES = ll.S
+million_SOURCES = million.S
+rep_prefix_SOURCES = rep_prefix.S
+
diff --git a/exp-bbv/tests/amd64-linux/clone_test.S b/exp-bbv/tests/amd64-linux/clone_test.S
new file mode 100644
index 0000000..10a2df3
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/clone_test.S
@@ -0,0 +1,95 @@
+     	     # thread 1 counts ~1 million instructions
+	     # thread 2 counts ~2 million instructions
+	     # each thread then counts ~500 thousand more before exiting
+	     
+	.globl _start	
+_start:	
+
+	#################################################
+        # ~1000 instructions in initial thread          #
+	#################################################
+	
+	xor	%rax,%rax
+	mov	$499,%rcx		# load counter (499 iterations x 2 insns)
+initial_loop:	
+	dec	%rcx			# repeat count times
+	jnz	initial_loop
+
+
+	#####################################################
+	# Spawn a thread!                                   #
+	#####################################################
+clone:
+	mov    $56,%rax			# clone syscall
+	
+	# Note, the raw clone syscall differs from the glibc wrapper
+	
+# 	int clone (flags, stack_pointer,parent_tidptr,child_tidptr,tls)
+
+
+	       				# Flags in 
+	       				#/usr/include/bits/sched.h
+					# CLONE_THREAD 0x10000
+					# CLONE_SIGHAND 0x800
+					# CLONE_VM      0x100
+					# the three above must be passed together
+					# the two below are required for Valgrind
+					# CLONE_FS	 0x200
+					# CLONE_FILES	 0x400
+
+	mov    $0x10f00,%rdi		# the five flags above OR-ed together
+	
+
+	mov    $(new_stack+4096),%rsi	 	 	# child stack pointer = end of new_stack
+
+	
+
+	mov    $0,%rdx		# third argument (parent_tidptr) = 0
+
+	syscall
+	
+	cmp   $0,%rax		# are we in new thread?
+	jz    thread2		# if so, jump to thread2
+
+
+	###############################################
+	# thread1                                     #
+	###############################################
+
+thread1:
+
+	mov	$499997,%rcx		# load counter
+thread1_loop:	
+	dec	%rcx			# repeat count times
+	jnz	thread1_loop
+
+	xor     %rdi,%rdi		# we return 0
+	jmp    exit
+	
+thread2:	
+	mov	$999997,%rcx		# load counter
+thread2_loop:	
+	dec	%rcx			# repeat count times
+	jnz	thread2_loop	
+	
+	mov    $5,%rdi			# we return 5
+	
+	
+	#================================
+	# Exit
+	#================================
+exit:
+
+     	# count an additional ~500 thousand instructions (250000 x 2)
+
+	mov	$250000,%rcx		# load counter
+exit_loop:	
+	dec	%rcx			# repeat count times
+	jnz	exit_loop	
+
+actual_exit:
+	mov	$60,%rax		# put exit syscall number (60) in rax
+	syscall
+
+.bss
+.lcomm	new_stack,4096
diff --git a/exp-bbv/tests/amd64-linux/clone_test.post.exp b/exp-bbv/tests/amd64-linux/clone_test.post.exp
new file mode 100644
index 0000000..55bcf61
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/clone_test.post.exp
@@ -0,0 +1,58 @@
+T 4    996    5    2    3    98991   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 1001    2    3    98994   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+
+
+# Thread 1
+#   Total intervals: 15 (Interval Size 100000)
+#   Total instructions: 1501007
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
+T 2    3    99996   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 99996    4   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 99998    2   
+
+
+# Thread 2
+#   Total intervals: 25 (Interval Size 100000)
+#   Total instructions: 2500001
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
diff --git a/exp-bbv/tests/amd64-linux/clone_test.stderr.exp b/exp-bbv/tests/amd64-linux/clone_test.stderr.exp
new file mode 100644
index 0000000..6a917a2
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/clone_test.stderr.exp
@@ -0,0 +1,12 @@
+# Thread 1
+#   Total intervals: 15 (Interval Size 100000)
+#   Total instructions: 1501007
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+# Thread 2
+#   Total intervals: 25 (Interval Size 100000)
+#   Total instructions: 2500001
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/amd64-linux/clone_test.vgtest b/exp-bbv/tests/amd64-linux/clone_test.vgtest
new file mode 100644
index 0000000..9f5cd4d
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/clone_test.vgtest
@@ -0,0 +1,5 @@
+prog: clone_test
+vgopts: --interval-size=100000 --bb-out-file=clone_test.out.bb --pc-out-file=clone_test.out.pc
+post:	cat clone_test.out.bb clone_test.out.bb.2 | ../filter_bb
+cleanup: rm -f clone_test.out.bb clone_test.out.bb.2 clone_test.out.pc
+
diff --git a/exp-bbv/tests/amd64-linux/complex_rep.S b/exp-bbv/tests/amd64-linux/complex_rep.S
new file mode 100644
index 0000000..80b8c8c
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/complex_rep.S
@@ -0,0 +1,58 @@
+# When trying (and failing) to instrument at the basic block level
+# I thought up a lot of corner-cases in the rep code.  This tries
+# to catch some of them
+
+# Performance counters give us 8207 insns
+#    by hand: 11 (setup) + 8*1024 (loop) + 3 (exit) = 8206
+
+	.globl _start	
+_start:	
+	cld				# we want these to happen forward
+
+	mov    $0xfeb131978,%rax	# value to store (stosb writes only %al)
+
+	# test back-to-back rep/stosb's
+
+	mov	$1024,%rcx
+	mov	$buffer1, %rdi		# set destination
+	rep	stosb	    		# store 1024 times
+	rep	stosb	    		# should store 0 times	
+	rep	stosb			# should store 0 times
+
+	
+	# test stosb where cx is 0
+	
+	xor    %rcx,%rcx
+	mov    $buffer1, %rdi		# set destination
+	rep    stosb	  		# should not store at all
+	
+	# test rep inside of a loop
+	
+	mov    $1024, %rbx		# outer loop count
+rep_loop:	
+
+	mov    $1024,%rcx
+	mov    $buffer1, %rdi		# set destination
+	rep    stosb
+	
+	mov    $1024,%rcx
+	mov    $buffer1, %rdi		# set destination
+	rep    stosb
+
+	dec    %rbx
+	jnz    rep_loop
+	
+	
+	#================================
+	# Exit
+	#================================
+exit:
+     	mov	$60,%rax		# exit syscall number
+	xor     %rdi,%rdi		# we return 0
+	syscall             		# and exit
+
+
+.bss
+
+.lcomm	buffer1,	16384
+
diff --git a/exp-bbv/tests/amd64-linux/complex_rep.stderr.exp b/exp-bbv/tests/amd64-linux/complex_rep.stderr.exp
new file mode 100644
index 0000000..ceabe14
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/complex_rep.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 0 (Interval Size 100000)
+#   Total instructions: 8206
+#   Total reps: 2100228
+#   Unique reps: 2052
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/amd64-linux/complex_rep.vgtest b/exp-bbv/tests/amd64-linux/complex_rep.vgtest
new file mode 100644
index 0000000..ef5ac30
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/complex_rep.vgtest
@@ -0,0 +1,4 @@
+prog: complex_rep
+vgopts: --interval-size=100000 --bb-out-file=complex_rep.out.bb
+cleanup: rm complex_rep.out.bb
+
diff --git a/exp-bbv/tests/amd64-linux/filter_stderr b/exp-bbv/tests/amd64-linux/filter_stderr
new file mode 100644
index 0000000..616ce05
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/filter_stderr
@@ -0,0 +1,4 @@
+#! /bin/sh
+
+../filter_stderr
+
diff --git a/exp-bbv/tests/amd64-linux/fldcw_check.S b/exp-bbv/tests/amd64-linux/fldcw_check.S
new file mode 100644
index 0000000..cfca2d0
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/fldcw_check.S
@@ -0,0 +1,129 @@
+# exp-bbv regression test: counting of fldcw instructions (x86-64, AT&T syntax)
+.globl _start
+
+_start:
+        # This code tests for the fldcw "load floating point command word"
+	#   instruction.  On most x86 processors the retired_instruction
+	#   performance counter counts this as one instruction.  However,
+	#   on Pentium 4 systems it counts as two.  Therefore this can
+	#   affect BBV results on such a system.
+	# fldcw is most often used to set the rounding mode when doing
+	#   floating point to integer conversions
+	
+	# It is encoded as "d9 /5" which means
+	#   1101 1001 xx10 1yyy
+	# Where xx is the "mod" which will be 00, 01, or 10 indicating offset
+	#   and yyy is the register field
+
+        # these are instructions with similar encodings to fldcw
+	# that can cause false positives if the test isn't explicit enough
+similar:	
+        fld1   	   	       		# d9 e8
+	fldl2t				# d9 e9
+	fldl2e				# d9 ea
+	fldpi				# d9 eb
+	fldlg2				# d9 ec
+	fldln2				# d9 ed
+	fldz				# d9 ee
+
+	# check some varied ways of calling fldcw
+
+	# offset on stack
+stack:	
+	sub	$8,%rsp			# allocate space on stack
+	fnstcw	2(%rsp)		
+	fldcw	2(%rsp)		
+	add	$8,%rsp			# restore stack
+
+	# 64-bit register
+sixtyfour_reg:	
+	fnstcw	cw
+	mov	$cw,%rax
+	fldcw	0(%rax)			# rax
+	mov	$cw,%rbx
+	fldcw	0(%rbx)			# rbx
+	mov	$cw,%rcx	
+	fldcw	0(%rcx)			# rcx
+	mov	$cw,%rdx		 
+	fldcw	0(%rdx)			# rdx
+
+	# 32-bit register
+thirtytwo_reg:	
+	fnstcw	cw
+	mov	$cw,%eax
+	fldcw	0(%eax)			# eax
+	mov	$cw,%ebx
+	fldcw	0(%ebx)			# ebx
+	mov	$cw,%ecx	
+	fldcw	0(%ecx)			# ecx
+	mov	$cw,%edx		 
+	fldcw	0(%edx)			# edx
+	
+	# register + 8-bit offset
+eight_bit:	
+	mov	$cw,%eax
+	sub	$32,%eax
+	
+	fldcw	32(%eax)		# eax + 8 bit offset
+	mov	%eax,%ebx
+	fldcw	32(%ebx)		# ebx + 8 bit offset	
+	mov	%eax,%ecx
+	fldcw	32(%ecx)		# ecx + 8 bit offset		
+	mov	%eax,%edx
+	fldcw	32(%edx)		# edx + 8 bit offset
+	
+	# register + 32-bit offset
+thirtytwo_bit:	
+	mov	$cw,%eax
+	sub	$30000,%eax
+	
+	fldcw	30000(%eax)		# eax + 32 bit offset
+	mov	%eax,%ebx
+	fldcw	30000(%ebx)		# ebx + 32 bit offset	
+	mov	%eax,%ecx
+	fldcw	30000(%ecx)		# ecx + 32 bit offset		
+	mov	%eax,%edx
+	fldcw	30000(%edx)		# edx + 32 bit offset			
+
+	# check an fp/integer conversion
+	# in a loop to give a bigger count
+
+	mov	$1024,%rcx
+big_loop:
+
+	fldl	three			# load value onto fp stack
+	fnstcw	saved_cw		# store control word to mem
+	movzwl	saved_cw, %eax		# load cw from mem, zero extending
+	movb	$12, %ah		# set cw for "round to zero"
+	movw	%ax, cw			# store back to memory (16-bit reg to match the w suffix)
+	fldcw	cw   			# load new rounding mode
+	fistpl	result			# save stack value as integer to mem
+	fldcw	saved_cw		# restore old cw
+	
+	loop	big_loop		# loop to make the count more obvious
+
+	movl	result, %ebx		# sanity check to see if the
+	cmp	$3,%rbx			# result is the expected one
+	je	exit
+	
+print_error:
+	mov 	$1,%rax			# write syscall
+	mov	$1,%rdi			# stdout
+	mov	$error,%rsi		# string	
+	mov 	$22,%rdx		# length of string
+	syscall
+	
+exit:	
+	xor	%rdi, %rdi		# return 0
+	mov	$60, %rax		# SYSCALL_EXIT
+	syscall
+	
+
+
+.data
+saved_cw:	.long 0
+cw:  	.long	0
+result: .long	0
+three:	.long	0			# a floating point 3.0 (double, low word first)
+	.long	1074266112
+error:	.asciz  "Error!  Wrong result!\n"
diff --git a/exp-bbv/tests/amd64-linux/fldcw_check.stderr.exp b/exp-bbv/tests/amd64-linux/fldcw_check.stderr.exp
new file mode 100644
index 0000000..9e7d33d
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/fldcw_check.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 0 (Interval Size 10000)
+#   Total instructions: 9270
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 2053
diff --git a/exp-bbv/tests/amd64-linux/fldcw_check.vgtest b/exp-bbv/tests/amd64-linux/fldcw_check.vgtest
new file mode 100644
index 0000000..f9bbae9
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/fldcw_check.vgtest
@@ -0,0 +1,4 @@
+prog: fldcw_check
+vgopts: --interval-size=10000 --bb-out-file=fldcw_check.out.bb
+cleanup: rm fldcw_check.out.bb
+
diff --git a/exp-bbv/tests/amd64-linux/ll.S b/exp-bbv/tests/amd64-linux/ll.S
new file mode 100644
index 0000000..95c5d3b
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/ll.S
@@ -0,0 +1,631 @@
+#
+#  linux_logo in x86_64 assembly language
+#    based on the code from ll_asm-0.36
+#
+#  By Vince Weaver <vince _at_ deater.net>
+#
+# Modified to remove non-deterministic system calls
+# And to avoid reading from /proc
+#
+
+		
+.include "../logo.include"
+
+# offsets into the results returned by the uname syscall
+.equ U_SYSNAME,0
+.equ U_NODENAME,65
+.equ U_RELEASE,65*2
+.equ U_VERSION,(65*3)
+.equ U_MACHINE,(65*4)
+.equ U_DOMAINNAME,65*5
+
+# offset into the results returned by the sysinfo syscall
+.equ S_TOTALRAM,32
+
+# Syscall numbers
+.equ SYSCALL_EXIT,    60
+.equ SYSCALL_READ,     0
+.equ SYSCALL_WRITE,    1
+.equ SYSCALL_OPEN,     2
+.equ SYSCALL_CLOSE,    3
+.equ SYSCALL_SYSINFO, 99
+.equ SYSCALL_UNAME,   63
+
+#
+.equ STDIN,0
+.equ STDOUT,1
+.equ STDERR,2
+
+	.globl _start	
+_start:	
+	#=========================
+	# PRINT LOGO
+	#=========================
+
+# LZSS decompression algorithm implementation
+# by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
+# optimized some more by Vince Weaver
+
+	# we used to fill the buffer with FREQUENT_CHAR
+	# but, that only gains us one byte of space in the lzss image.
+	# the lzss algorithm does automatic RLE... pretty clever
+	# so we compress with NUL as FREQUENT_CHAR and it is pre-done for us
+
+	mov     $(N-F), %ebp   	     	# R
+
+	mov  	$logo, %esi		# %esi points to logo (for lodsb)
+
+	mov	$out_buffer, %edi	# point to out_buffer
+	push	%rdi	     		# save this value for later
+
+	xor	%ecx, %ecx
+
+decompression_loop:	
+	lodsb			# load in a byte
+
+	mov 	$0xff, %bh	# re-load top as a hackish 8-bit counter
+	mov 	%al, %bl	# move in the flags
+
+test_flags:
+	cmp	$logo_end, %esi # have we reached the end?
+	je	done_logo  	# ! if so, exit
+
+	shr 	$1, %ebx	# shift bottom bit into carry flag
+	jc	discrete_char	# ! if set, we jump to discrete char
+
+offset_length:
+	lodsw                   # get match_length and match_position
+	mov %eax,%edx		# copy to edx
+	    			# no need to mask dx, as we do it
+				# by default in output_loop
+	
+	shr $(P_BITS),%eax	
+	add $(THRESHOLD+1),%al
+	mov %al,%cl             # cl = (ax >> P_BITS) + THRESHOLD + 1
+				  #                       (=match_length)
+		
+output_loop:
+	and 	$POSITION_MASK,%dh  	# mask it
+	mov 	text_buf(%rdx), %al	# load byte from text_buf[]
+	inc 	%edx	    		# advance pointer in text_buf
+store_byte:	
+	stosb				# store it
+	
+	mov     %al, text_buf(%rbp)	# store also to text_buf[r]
+	inc 	%ebp 			# r++
+	and 	$(N-1), %bp		# mask r
+
+	loop 	output_loop		# repeat until k>j
+	
+	or	%bh,%bh			# ! if 0 we shifted through 8 and must
+	jnz	test_flags		# re-load flags
+	
+	jmp 	decompression_loop
+
+discrete_char:
+	lodsb				# load a byte
+	inc	%ecx			# we set ecx to one so byte
+					# will be output once
+					# (how do we know ecx is zero?)
+					
+	jmp     store_byte              # and cleverly store it
+
+
+# end of LZSS code
+
+done_logo:
+
+	pop 	%rbp			# get out_buffer and keep in bp
+	mov	%ebp,%ecx		# move out_buffer to ecx
+
+	call	write_stdout		# print the logo
+
+	#
+	#  Setup
+	#
+setup:
+	mov	$strcat,%edx		# use rdx as call pointer (smaller op)
+
+	
+	#==========================
+	# PRINT VERSION
+	#==========================
+	
+#	push 	$SYSCALL_UNAME		# uname syscall
+#	pop	%rax			# in 3 bytes	
+	mov	$uname_info,%edi	# uname struct (0 extend address)
+#	syscall				# do syscall
+
+	mov	%ebp,%edi		# point %edi to out_buffer
+		
+	mov	$(uname_info+U_SYSNAME),%esi	# os-name from uname "Linux"
+	call	*%rdx			# call strcat
+
+	mov	$ver_string,%esi		# source is " Version "
+	call 	*%rdx			        # call strcat
+	push	%rsi  				# save our .txt pointer
+	
+	mov	$(uname_info+U_RELEASE),%esi    # version from uname "2.4.1"
+	call 	*%rdx				# call strcat
+	
+	pop	%rsi  			# restore .txt pointer
+					# source is ", Compiled "
+	call 	*%rdx			# call strcat
+	push	%rsi  			# store for later
+
+	mov	$(uname_info+U_VERSION),%esi	# compiled date
+	call 	*%rdx			# call strcat
+
+	mov	%ebp,%ecx		# move out_buffer to ecx
+
+	mov	$0xa,%ax		# store linefeed on end
+	stosw				# and zero			  
+
+	call	*%rdx			# call strcat
+	
+	call	center_and_print	# center and print
+
+	#===============================
+	# Middle-Line
+	#===============================
+middle_line:		
+	#=========
+	# Load /proc/cpuinfo into buffer
+	#=========
+
+	push	%rdx			# save call pointer
+
+#	push	$SYSCALL_OPEN		# load 2 [ open() ]
+#	pop	%rax			# in 3 bytes
+	
+#	mov	$cpuinfo,%edi		# '/proc/cpuinfo'
+#	xor	%esi,%esi		# 0 = O_RDONLY <bits/fcntl.h>
+#	cdq				# clear edx in clever way
+#	syscall				# syscall.  fd in eax.  
+					# we should check that eax>=0
+					
+#	mov	%eax,%edi		# save our fd
+	
+#	xor	%eax,%eax		# SYSCALL_READ make== 0
+
+	mov	$disk_buffer,%esi
+
+#	mov	$16,%dh		 	# 4096 is maximum size of proc file #)
+					# we load sneakily by knowing
+					# 16<<8 = 4096. be sure edx clear
+
+#	syscall
+
+#	push	$SYSCALL_CLOSE		# close (to be correct)
+#	pop	%rax
+#	syscall			
+
+	#=============
+	# Number of CPUs
+	#=============
+number_of_cpus:
+
+	xor	%ebx,%ebx		# chip count
+	
+					# $disk_buffer still in %rsi
+bogo_loop:	
+	mov	(%rsi), %eax		# load 4 bytes into eax
+	inc	%esi			# increment pointer
+	
+	cmp	$0,%al			# check for end of file
+	je	done_bogo
+	
+	cmp	$('o'<<24+'g'<<16+'o'<<8+'b'),%eax	
+				        # "bogo" in little-endian
+					
+	jne	bogo_loop		# ! if not equal, keep going
+	add	$2,%ebx			# otherwise, we have a bogo
+					# 2 times too for future magic
+	jmp	bogo_loop
+
+done_bogo:
+	lea	one-6(%rbx,%rbx,2), %esi	
+				    	# Load into esi
+					# [one]+(num_cpus*6)
+					#
+					# the above multiplies by three
+					# esi = (ebx+(ebx*2))
+	 				# and we double-incremented ebx 
+					# earlier
+	 
+	mov	%ebp,%edi		# move output buffer to edi
+
+	pop	%rdx			# restore call pointer
+	call	*%rdx			# copy it (call strcat)
+
+	mov	$' ',%al		# print a space
+	stosb
+
+	push %rbx
+	push %rdx			# store strcat pointer
+
+	#=========
+	# MHz
+	#=========
+print_mhz:
+	mov	$('z'<<24+'H'<<16+'M'<<8+' '),%ebx	
+			   		# find ' MHz' and grab up to .
+	                                # we are little endian
+	mov	$'.',%ah
+
+	# below is same as "sub $(strcat-find_string),%edx
+	# gas won't let us force the one-byte constant
+	.byte 0x83,0xEA,strcat-find_string   
+	
+	call	*%rdx			# call find string
+
+	mov	%ebx,%eax  		# clever way to get MHz in, sadly
+	ror	$8,%eax			# not any smaller than a mov
+	stosl	    			
+
+	#=========
+	# Chip Name
+	#=========
+chip_name:	
+	mov	$('e'<<24+'m'<<16+'a'<<8+'n'),%ebx     	
+					# find 'name\t: ' and grab up to \n
+					# we are little endian
+	mov	$' ',%ah
+	call	*%rdx	   		# call find_string
+	stosb
+	call 	skip_spaces
+	
+	pop     %rdx
+	pop     %rbx                    # restore chip count
+	pop     %rsi
+				
+	call    *%rdx                   # ' Processor'
+	cmpb    $2,%bl
+	jne     print_s
+	inc     %rsi   			# ! if singular, skip the s
+print_s:
+        call    *%rdx                   # 's, '
+	
+        push    %rsi                    # restore the values
+	push    %rdx
+			
+	#========
+	# RAM
+	#========
+
+#	push	%rdi	
+#	push    $SYSCALL_SYSINFO	# sysinfo() syscall
+#	pop	%rax	
+#	mov	$sysinfo_buff,%edi
+#	syscall
+#	pop	%rdi
+
+	# The following has to be a 64 bit load, to support
+	# Ram > 4GB
+	mov	(sysinfo_buff+S_TOTALRAM),%rax	# size in bytes of RAM
+	shr	$20,%rax		# divide by 1024*1024 to get M
+	adc	$0, %eax		# round 
+
+	call num_to_ascii
+	
+	pop  %rdx	 		# restore strcat pointer
+	
+	pop     %rsi	 		# print 'M RAM, '
+	call	*%rdx			# call strcat
+
+	push	%rsi
+	
+	#========
+	# Bogomips
+	#========
+	
+	mov	$('s'<<24+'p'<<16+'i'<<8+'m'),%ebx      	
+					# find 'mips\t: ' and grab up to \n
+	mov	$0xa,%ah
+	call	find_string
+
+	pop	%rsi	   		# bogo total follows RAM 
+
+	call 	*%rdx			# call strcat
+
+	push	%rsi
+
+	mov	%ebp,%ecx		# point ecx to out_buffer
+
+	push	%rcx
+	call	center_and_print	# center and print
+
+	#=================================
+	# Print Host Name
+	#=================================
+last_line:
+	mov     %ebp,%edi		# point to output_buffer
+	
+	mov	$(uname_info+U_NODENAME),%esi	# host name from uname()
+	call    *%rdx			# call strcat
+
+	pop	%rcx	      		# ecx is unchanged
+	call	center_and_print	# center and print
+	
+	pop	%rcx			# (.txt) pointer to default_colors
+	
+	call	write_stdout
+
+	#================================
+	# Exit
+	#================================
+exit:
+	push	$SYSCALL_EXIT		# Put exit syscall in rax
+	pop	%rax
+
+	xor	%edi,%edi		# Make return value $0
+	syscall
+
+
+	#=================================
+	# FIND_STRING 
+	#=================================
+	#   ah is char to end at
+	#   ebx is 4-char ascii string to look for
+	#   edi points at output buffer
+
+find_string:
+					
+	mov	$disk_buffer-1,%esi	# look in cpuinfo buffer
+find_loop:
+	inc	%esi
+	cmpb	$0, (%rsi)		# are we at EOF?
+	je	done			# ! if so, done
+
+	cmp	(%rsi), %ebx		# do the strings match?
+	jne	find_loop		# ! if not, loop
+	
+					# ! if we get this far, we matched
+
+find_colon:
+	lodsb				# repeat till we find colon
+	cmp	$0,%al
+	je	done
+	cmp	$':',%al
+	jne	find_colon
+
+skip_spaces:		
+	lodsb				# skip spaces
+	cmp	$0x20,%al		# Loser new intel chips have lots??
+	je	skip_spaces
+	
+store_loop:	 
+	cmp	$0,%al
+	je	done
+	cmp	%ah,%al			# is it end string?
+	je 	almost_done		# ! if so, finish
+	cmp	$'\n',%al
+	je	almost_done
+	stosb				# ! if not store and continue
+	lodsb
+	
+	jmp	store_loop
+	 
+almost_done:	 
+	movb	 $0, (%rdi)	        # replace last value with NUL 
+done:
+	ret
+
+
+	#================================
+	# strcat: copy the NUL-terminated string at rsi to rdi
+	#   out: rsi just past the source NUL, edi on the copied NUL
+	#================================
+strcat:
+	lodsb				# al = (%rsi), then advance rsi
+	stosb				# (%rdi) = al, then advance rdi
+	cmp	$0,%al			# was that the terminating NUL?
+	jne	strcat			# ! if not loop
+	dec	%edi			# back up onto the NUL just stored
+	ret				# so the next strcat overwrites it
+
+	#==============================
+	# center_and_print
+	#==============================
+	# in:  ecx = NUL-terminated string to center, edi = end of buffer text
+	# emits ESC [ <n> C with n = (81 - length)/2, then prints the string
+center_and_print:
+	push    %rdx			# save strcat pointer
+	push	%rcx			# save the string pointer
+	inc	%edi			# step past the NUL to scratch space
+	push	%rdi			# save start of the escape sequence
+
+	mov	$('['<<8+27),%ax	# we want to output ^[[
+	stosw
+
+	cdq	      			# edx = 0 (needs eax bit 31 clear)
+	
+str_loop2:				# find end of string	
+	inc	%edx
+	cmpb	$0,(%rcx,%rdx)		# repeat till we find zero
+	jne	str_loop2
+	
+	push	$81	 		# one added to cheat, we don't
+					# count the trailing '\n'
+	pop	%rax
+	
+	cmp	%eax,%edx		# is length < 81 ?
+	jl	not_too_big		# ! if so, keep it; else clamp to 80
+	push	$80
+	pop	%rdx
+	
+not_too_big:			
+	sub	%edx,%eax		# eax = 81 - length
+	
+	shr	%eax			# then divide by 2
+	
+	call	num_to_ascii		# print number of spaces
+	mov	$'C',%al		# tack a 'C' on the end
+					# ah is zero from num_to_ascii
+	stosw				# store C and a NULL
+	pop  %rcx			# pop the pointer to ^[[xC
+	
+	call write_stdout		# write to the screen
+	
+done_center:
+	pop  %rcx			# restore string pointer
+	     				# and fall through to print it
+
+	pop %rdx			# restore strcat pointer
+
+	#================================
+	# WRITE_STDOUT
+	#================================
+	# in: ecx = NUL-terminated string; rdx is saved and restored
+	# rax,rsi,rdi trashed (the syscall insn also overwrites rcx, r11)
+write_stdout:
+	push    %rdx
+	push	$SYSCALL_WRITE		# put SYSCALL_WRITE (1) in rax
+	pop     %rax     		# in 3 bytes of code
+	
+	cdq   	      			# clear edx (eax is 1, so sign is 0)
+	
+	lea	1(%rdx),%edi		# put 1 (STDOUT) in edi, the fd arg
+					# in 3 bytes of code
+
+	mov	%ecx,%esi		# rsi = buffer argument
+	
+str_loop1:
+	inc	%edx			# edx = length, the count argument
+	cmpb	$0,(%rcx,%rdx)		# repeat till zero
+	jne	str_loop1
+
+	syscall  			# run the syscall
+	pop	%rdx
+	ret
+
+	##############################
+	# num_to_ascii
+	##############################
+	# eax = value to print in decimal (must be < 2^31, see cdq below)
+	# edi points to where we want it; advanced past the digits written
+	# trashes rax, rbx, rcx, rdx
+num_to_ascii:
+	push    $10
+	pop     %rbx
+	xor     %ecx,%ecx       # clear ecx
+div_by_10:
+	cdq                     # edx = 0 as long as eax bit 31 is clear
+	div     %ebx            # eax = quotient, edx = next digit
+	push    %rdx            # save for later
+	inc     %ecx            # add to length counter
+	or      %eax,%eax       # was Q zero?
+	jnz     div_by_10       # ! if not divide again
+	
+write_out:
+	pop     %rax            # restore in reverse order
+	add     $0x30, %al      # convert to ASCII
+	stosb                   # save digit
+	loop    write_out       # one pass per digit counted in ecx
+	ret
+
+#===========================================================================
+#	section .data
+#===========================================================================
+.data
+
+ver_string:	.ascii	" Version \0"
+compiled_string:	.ascii	", Compiled \0"
+processor:		.ascii  " Processor\0"
+s_comma:		.ascii  "s, \0"
+ram_comma:	.ascii	"M RAM, \0"
+bogo_total:	.ascii	" Bogomips Total\n\0"
+
+default_colors:	.ascii "\033[0m\n\n\0"
+
+cpuinfo:	.ascii	"/proc/cpuinfo\0"
+
+
+one:	.ascii	"One\0\0\0"
+two:	.ascii	"Two\0\0\0"
+three:	.ascii	"Three\0"
+four:	.ascii	"Four\0"
+
+.include	"../logo.lzss_new"
+
+disk_buffer:
+.ascii "processor	: 0\n"
+.ascii "vendor_id	: GenuineIntel\n"
+.ascii "cpu family	: 15\n"
+.ascii "model		: 6\n"
+.ascii "model name	: Intel(R) Xeon(TM) CPU 3.46GHz\n"
+.ascii "stepping	: 4\n"
+.ascii "cpu MHz		: 3200.000\n"
+.ascii "cache size	: 2048 KB\n"
+.ascii "physical id	: 0\n"
+.ascii "siblings	: 2\n"
+.ascii "core id		: 0\n"
+.ascii "cpu cores	: 2\n"
+.ascii "apicid		: 0\n"
+.ascii "initial apicid	: 0\n"
+.ascii "fpu		: yes\n"
+.ascii "fpu_exception	: yes\n"
+.ascii "cpuid level	: 6\n"
+.ascii "wp		: yes\n"
+.ascii "flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc pebs bts pni dtes64 monitor ds_cpl vmx est cid cx16 xtpr pdcm lahf_lm tpr_shadow\n"
+.ascii "bogomips	: 6934.38\n"
+.ascii "clflush size	: 64\n"
+.ascii "cache_alignment	: 128\n"
+.ascii "address sizes	: 36 bits physical, 48 bits virtual\n"
+.ascii "power management:\n"
+.ascii "\n"
+.ascii "processor	: 1\n"
+.ascii "vendor_id	: GenuineIntel\n"
+.ascii "cpu family	: 15\n"
+.ascii "model		: 6\n"
+.ascii "model name	: Intel(R) Xeon(TM) CPU 3.46GHz\n"
+.ascii "stepping	: 4\n"
+.ascii "cpu MHz		: 3200.000\n"
+.ascii "cache size	: 2048 KB\n"
+.ascii "physical id	: 1\n"
+.ascii "siblings	: 2\n"
+.ascii "core id		: 0\n"
+.ascii "cpu cores	: 2\n"
+.ascii "apicid		: 4\n"
+.ascii "initial apicid	: 4\n"
+.ascii "fpu		: yes\n"
+.ascii "fpu_exception	: yes\n"
+.ascii "cpuid level	: 6\n"
+.ascii "wp		: yes\n"
+.ascii "flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc pebs bts pni dtes64 monitor ds_cpl vmx est cid cx16 xtpr pdcm lahf_lm tpr_shadow\n"
+.ascii "bogomips	: 6934.13\n"
+.ascii "clflush size	: 64\n"
+.ascii "cache_alignment	: 128\n"
+.ascii "address sizes	: 36 bits physical, 48 bits virtual\n"
+.ascii "power management:\n\0"
+
+uname_info:
+.ascii "Linux\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "domori\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "2.6.29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "#1 SMP Mon May 4 09:51:54 EDT 2009\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+sysinfo_buff:
+.long 0,0,0,0,0,0,0,0,2048*1024*1024,0,0,0,0,0,0,0
+
+
+#============================================================================
+#	section .bss
+#============================================================================
+.bss
+
+.lcomm  text_buf, (N+F-1)
+.lcomm	out_buffer,16384
diff --git a/exp-bbv/tests/amd64-linux/ll.post.exp b/exp-bbv/tests/amd64-linux/ll.post.exp
new file mode 100644
index 0000000..5417125
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/ll.post.exp
@@ -0,0 +1,54 @@
+T:1:10   :7:10   :5:38   :2:44   :8:65   :9:662   :4:119   :6:2   :3:51   
+T:7:5   :5:16   :2:18   :8:52   :9:858   :4:35   :6:1   :3:15   
+T:7:5   :5:16   :2:18   :8:52   :9:858   :4:35   :6:1   :3:15   
+T:7:5   :5:14   :2:16   :8:91   :9:863   :4:7   :6:1   :3:3   
+T:7:5   :5:12   :2:14   :8:78   :9:880   :4:7   :6:1   :3:3   
+T:7:5   :5:6   :2:8   :8:52   :9:928   :6:1   
+T:7:5   :5:10   :2:10   :8:65   :9:909   :6:1   
+T:7:5   :5:14   :2:18   :8:117   :9:845   :6:1   
+T:5:8   :2:8   :8:52   :9:932   
+T:7:5   :5:8   :2:10   :8:65   :9:911   :6:1   
+T:5:8   :2:8   :8:52   :9:932   
+T:7:5   :5:6   :2:8   :8:52   :9:928   :6:1   
+T:5:6   :2:6   :8:39   :9:949   
+T:7:5   :5:6   :2:8   :8:52   :9:928   :6:1   
+T:5:4   :2:4   :8:26   :9:966   
+T:7:5   :5:12   :2:14   :8:78   :9:880   :4:7   :6:1   :3:3   
+T:5:6   :2:6   :8:39   :9:949   
+T:7:5   :5:8   :2:10   :8:65   :9:911   :6:1   
+T:7:5   :5:14   :2:16   :8:91   :9:863   :4:7   :6:1   :3:3   
+T:5:8   :2:8   :8:52   :9:932   
+T:7:5   :5:10   :2:12   :8:78   :9:894   :6:1   
+T:7:5   :5:10   :2:12   :8:74   :9:898   :6:1   
+T:5:12   :2:12   :8:82   :9:894   
+T:7:5   :5:8   :2:8   :8:39   :9:390   :4:7   :6:1   :3:3   :10:3   :11:9   :12:527   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:15:5   :18:2   :19:3   :20:2   :21:3   :22:4   :16:281   :17:10   :12:687   :13:1   :14:2   
+T:23:1   :32:7   :34:351   :33:176   :16:3   :17:2   :24:10   :25:195   :26:4   :27:3   :30:4   :31:11   :11:9   :12:204   :13:2   :14:4   :28:9   :29:5   
+T:34:666   :33:334   
+T:34:667   :33:333   
+T:34:665   :33:333   :35:2   
+T:34:667   :33:333   
+T:34:667   :33:333   
+T:34:666   :33:334   
+T:34:666   :33:332   :35:2   
+T:34:357   :33:178   :36:4   :37:8   :38:4   :40:258   :39:173   :16:16   :17:2   
+T:49:6   :50:2   :51:4   :52:2   :53:1   :54:6   :56:3   :38:4   :40:333   :39:225   :41:39   :42:26   :43:15   :44:46   :45:46   :46:40   :47:60   :48:6   :16:88   :17:4   :28:9   :55:18   :29:17   
+T:57:4   :38:4   :40:591   :39:395   :16:4   :17:2   
+T:40:600   :39:400   
+T:58:2   :59:4   :40:453   :39:303   :41:18   :42:12   :43:6   :44:16   :45:16   :46:14   :47:21   :48:2   :16:68   :17:2   :24:10   :25:53   
+
+
+# Thread 1
+#   Total intervals: 45 (Interval Size 1000)
+#   Total instructions: 45639
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
diff --git a/exp-bbv/tests/amd64-linux/ll.stderr.exp b/exp-bbv/tests/amd64-linux/ll.stderr.exp
new file mode 100644
index 0000000..3e75445
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/ll.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 45 (Interval Size 1000)
+#   Total instructions: 45639
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/amd64-linux/ll.stdout.exp b/exp-bbv/tests/amd64-linux/ll.stdout.exp
new file mode 100644
index 0000000..61cd23c
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/ll.stdout.exp
@@ -0,0 +1,17 @@
+###############################################################################
+###############################################################################
+##################################################################O#O##########
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+
+Linux Version 2.6.29, Compiled #1 SMP Mon May 4 09:51:54 EDT 2009
+Two 3200MHz Intel(R) Xeon(TM) Processors, 2048M RAM, 6934.38 Bogomips Total
+domori
+
diff --git a/exp-bbv/tests/amd64-linux/ll.vgtest b/exp-bbv/tests/amd64-linux/ll.vgtest
new file mode 100644
index 0000000..6031a58
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/ll.vgtest
@@ -0,0 +1,5 @@
+prog: ll
+vgopts: --interval-size=1000 --bb-out-file=ll.out.bb
+post:	cat ll.out.bb
+cleanup: rm ll.out.bb
+
diff --git a/exp-bbv/tests/amd64-linux/million.S b/exp-bbv/tests/amd64-linux/million.S
new file mode 100644
index 0000000..d72ee4b
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/million.S
@@ -0,0 +1,22 @@
+
+     	     # count for 1 million instructions
+	     #   total is 2 + 1 + 499997*2 + 3
+	     
+	.globl _start	
+_start:	
+	xor	%rcx,%rcx		# not needed, pads total to 1M
+	xor	%rax,%rax		# not needed, pads total to 1M
+	
+	mov	$499997,%rcx		# load counter
+test_loop:	
+	dec	%rcx			# repeat count times
+	jnz	test_loop
+
+	#================================
+	# Exit
+	#================================
+exit:
+	xor     %rdi,%rdi		# we return 0
+	mov	$60,%rax		# put exit syscall number (60) in rax
+	syscall
+
diff --git a/exp-bbv/tests/amd64-linux/million.post.exp b/exp-bbv/tests/amd64-linux/million.post.exp
new file mode 100644
index 0000000..30bdd29
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/million.post.exp
@@ -0,0 +1,21 @@
+T:1:5   :2:99996   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+
+
+# Thread 1
+#   Total intervals: 10 (Interval Size 100000)
+#   Total instructions: 1000000
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
+F:1:400078:
+F:2:400085:
+F:3:40008a:
diff --git a/exp-bbv/tests/amd64-linux/million.stderr.exp b/exp-bbv/tests/amd64-linux/million.stderr.exp
new file mode 100644
index 0000000..adeb35d
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/million.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 10 (Interval Size 100000)
+#   Total instructions: 1000000
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/amd64-linux/million.vgtest b/exp-bbv/tests/amd64-linux/million.vgtest
new file mode 100644
index 0000000..969a636
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/million.vgtest
@@ -0,0 +1,5 @@
+prog: million 
+vgopts: --interval-size=100000 --bb-out-file=million.out.bb --pc-out-file=million.out.pc
+post:	cat million.out.bb million.out.pc
+cleanup: rm million.out.bb million.out.pc
+
diff --git a/exp-bbv/tests/amd64-linux/rep_prefix.S b/exp-bbv/tests/amd64-linux/rep_prefix.S
new file mode 100644
index 0000000..6fe8ac3
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/rep_prefix.S
@@ -0,0 +1,347 @@
+#
+# rep, repe (repz) and repne (repnz) prefixed string instructions
+#   only count as one instruction, even though they repeat many times
+# This test makes sure the bbv plugin counts these instructions properly
+# The expected counts were validated against hardware performance counters.
+#
+
+	.globl _start	
+_start:	
+	cld				# we want these to happen forward
+
+
+	#===============================================
+	# Some SSE2 instructions start with 0xf2 or 0xf3
+	# Check for them, to make sure our rep detection
+	#   handles things properly.
+	# We should check this on x86 too, but then we'd
+	#   have to check for SSE2 capability somehow?
+	#===================================
+false_positives:
+
+	movdqu	%xmm1,%xmm2
+	movdqu	%xmm2,%xmm1
+	addsd	%xmm1,%xmm2
+	pause
+
+	#===================================
+	# Check varied order of the size prefix
+	#   with the rep prefix.  Older binutils
+	#   did this one way, newer binutils the other
+	#===================================
+	
+size_prefix:
+	# test 16-bit load, hand-encoded with both prefix orders
+	
+	mov	$8192, %rcx		# repeat count
+	mov	$buffer1, %rsi		# set source
+	.byte 0x66, 0xf3, 0xad		# rep lodsw: 0x66 size prefix first
+	
+	mov	$8192, %rcx		# repeat count
+	mov	$buffer1, %rsi		# set source
+	.byte 0xf3, 0x66, 0xad		# rep lodsw: 0xf3 rep prefix first
+	
+	
+
+
+	#===================================
+	# Load and Store Instructions
+	#===================================
+loadstore:
+	xor	%rax, %rax
+	mov	$0xd, %al		# set eax to d
+	
+	# test 8-bit store
+	
+	mov	$16384, %rcx
+	mov	$buffer1, %rdi		# set destination
+	rep	stosb	    		# store d 16384 times, auto-increment
+	
+	# test 8-bit load
+	
+	mov	$16384, %rcx
+	mov	$buffer1, %rsi		# set source
+	rep	lodsb	    		# load byte 16384 times, auto-increment
+
+	cmp	$0xd,%al		# if we loaded wrong value
+	jne	print_error		# print an error
+
+	# test 16-bit store
+	
+	mov    	$0x020d,%ax		# store 0x020d
+	
+	mov	$8192, %rcx
+	mov	$buffer1, %rdi		# set destination
+	rep	stosw	    		# store 8192 times, auto-increment
+	
+	# test 16-bit load
+	
+	mov	$8192, %rcx
+	mov	$buffer1, %rsi		# set source
+	rep	lodsw	    		# load 8192 times, auto-increment
+
+	cmp	$0x020d,%ax		# if we loaded wrong value
+	jne	print_error		# print an error
+
+	# test 32-bit store
+	
+	mov    	$0x0feb1378,%eax	# store 0x0feb1378
+	
+	mov	$4096, %rcx
+	mov	$buffer1, %rdi		# set destination
+	rep	stosl	    		# store 4096 times, auto-increment
+	
+	# test 32-bit load
+	
+	mov	$4096, %rcx
+	mov	$buffer1, %rsi		# set source
+	rep	lodsl	    		# load 4096 times, auto-increment
+
+	cmp	$0x0feb1378,%eax	# if we loaded wrong value
+	jne	print_error		# print an error
+	
+	# test 64-bit store
+	
+	mov    	$0xfeb131978a5a5a5a,%rax	
+						
+	mov	$2048, %rcx
+	mov	$buffer1, %rdi		# set destination
+	rep	stosq	    		# store 2048 times, auto-increment
+	
+	# test 64-bit load
+	
+	mov	$2048, %rcx
+	mov	$buffer1, %rsi		# set source
+	rep	lodsq	    		# load 2048 times, auto-increment
+
+	cmp     $0x8a5a5a5a,%eax
+					# !if we loaded wrong value
+	jne	print_error		# print an error
+	
+
+	#=============================
+	# Move instructions
+	#=============================
+moves:
+	# test 8-bit move
+	
+	mov    $16384, %rcx
+	mov    $buffer1, %rsi
+	mov    $buffer2, %rdi
+	rep    movsb
+
+	# test 16-bit move
+	
+	mov    $8192, %rcx
+	mov    $buffer2, %rsi
+	mov    $buffer1, %rdi
+	rep    movsw
+
+	# test 32-bit move
+	
+	mov    $4096, %rcx
+	mov    $buffer1, %rsi
+	mov    $buffer2, %rdi
+	rep    movsl	
+	
+	# test 64-bit move
+	
+	mov    $2048, %rcx
+	mov    $buffer1, %rsi
+	mov    $buffer2, %rdi
+	rep    movsq		
+	
+
+	#==================================
+	# Compare equal instructions
+	#==================================
+compare_equal:	
+	# first set up the areas to compare
+	
+	mov	$0xa5a5a5a5,%eax
+	mov	$buffer1, %rdi
+	mov	$4096, %rcx
+	rep	stosl
+	
+	mov	$0xa5a5a5a5,%eax
+	mov	$buffer2, %rdi
+	mov	$4096, %rcx
+	rep	stosl
+
+
+	# test 8-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$16384, %rcx
+	repe	cmpsb
+	jnz	print_error
+
+	# test 16-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$8192, %rcx
+	repe	cmpsw
+	jnz	print_error
+
+	# test 32-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$4096, %rcx
+	repe	cmpsl
+	jnz	print_error		
+	
+	# test 64-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$2048, %rcx
+	repe	cmpsq
+	jnz	print_error			
+
+
+
+	#==================================
+	# Compare not equal instructions
+	#==================================
+compare_noteq:	
+	# change second buffer
+	
+	mov	$0x5a5a5a5a,%eax
+	mov	$buffer2, %rdi
+	mov	$4096, %rcx
+	rep	stosl
+	
+	# test 8-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$16384, %rcx
+#	repne	cmpsb             FIXME!  Not implemented valgrind
+#	je	print_error
+
+	# test 16-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$8192, %rcx
+#	repne	cmpsw             FIXME!  Not implemented valgrind
+#	je	print_error	
+
+	# test 32-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$4096, %rcx
+#	repne	cmpsl             FIXME!  Not implemented valgrind
+#	je	print_error			
+
+	# test 64-bit
+	
+	mov	$buffer1,%rsi
+	mov	$buffer2,%rdi
+	mov	$2048, %rcx
+#	repne	cmpsq             FIXME!  Not implemented valgrind
+#	je	print_error			
+
+	#====================================
+	# Check scan equal instruction
+	#====================================
+scan_eq:
+	# test 8-bit
+
+	mov     $0xa5,%al
+	mov	$buffer1,%rdi
+	mov	$16384, %rcx
+	repe	scasb
+	jnz	print_error
+
+	# test 16-bit
+	
+	mov     $0xa5a5,%ax
+	mov	$buffer1,%rdi
+	mov	$8192, %rcx
+	repe	scasw
+	jnz	print_error	
+
+	# test 32-bit
+	
+	mov	$0xa5a5a5a5,%eax
+	mov	$buffer1,%rdi
+	mov	$4096, %rcx
+	repe	scasl
+	jnz	print_error		
+	
+	# test 64-bit
+	
+	mov	$0xa5a5a5a5a5a5a5a5,%rax
+	mov	$buffer1,%rdi
+	mov	$2048, %rcx
+	repe	scasq
+	jnz	print_error			
+	
+
+	#====================================
+	# Check scan not-equal instruction
+	#====================================
+
+	# test 8-bit
+scan_ne:
+	mov     $0xa5,%al
+	mov	$buffer2,%rdi
+	mov	$16384, %rcx
+	repne	scasb
+	jz	print_error
+
+	# test 16-bit
+	
+	mov     $0xa5a5,%ax
+	mov	$buffer2,%rdi
+	mov	$8192, %rcx
+	repne	scasw
+	jz	print_error	
+	
+	# test 32-bit
+	
+	mov	$0xa5a5a5a5,%eax
+	mov	$buffer2,%rdi
+	mov	$4096, %rcx
+	repne	scasl
+	jz	print_error		
+	
+	# test 64-bit
+	
+	mov	$0xa5a5a5a5a5a5a5a5,%rax
+	mov	$buffer2,%rdi
+	mov	$2048, %rcx
+	repne	scasq
+	jz	print_error			
+
+	jmp	exit			# no error, skip to exit
+	
+print_error:
+	    
+	mov 	$1, %rax		# Write syscall
+	mov	$1, %rdi		# print to stdout
+	mov	$error_string, %rsi	# string to print
+	mov	$16, %edx      	   	# strlen
+	syscall	 			# call syscall
+
+	#================================
+	# Exit
+	#================================
+exit:
+     	mov	$60,%rax
+	xor     %rdi,%rdi		# we return 0
+	syscall             		# and exit
+
+
+.data
+error_string:	.asciz "Error detected!\n"
+
+.bss
+
+.lcomm	buffer1,	16384
+.lcomm	buffer2,	16384
diff --git a/exp-bbv/tests/amd64-linux/rep_prefix.stderr.exp b/exp-bbv/tests/amd64-linux/rep_prefix.stderr.exp
new file mode 100644
index 0000000..2ca3548
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/rep_prefix.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 0 (Interval Size 100000)
+#   Total instructions: 152
+#   Total reps: 165917
+#   Unique reps: 29
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/amd64-linux/rep_prefix.vgtest b/exp-bbv/tests/amd64-linux/rep_prefix.vgtest
new file mode 100644
index 0000000..bc89a1c
--- /dev/null
+++ b/exp-bbv/tests/amd64-linux/rep_prefix.vgtest
@@ -0,0 +1,4 @@
+prog: rep_prefix
+vgopts: --interval-size=100000 --bb-out-file=rep_prefix.out.bb
+cleanup: rm rep_prefix.out.bb
+
diff --git a/exp-bbv/tests/filter_bb b/exp-bbv/tests/filter_bb
new file mode 100644
index 0000000..ca9345f
--- /dev/null
+++ b/exp-bbv/tests/filter_bb
@@ -0,0 +1,12 @@
+#! /bin/sh
+
+dir=`dirname $0`
+
+$dir/../../tests/filter_stderr_basic  |
+
+# This attempts to filter out the basic block numbers
+# While keeping total count.  This is because the
+#  basic block number is non-deterministic on a
+#  multi-threaded benchmark
+
+sed s/:\[0-9\]\*:/' '/g
diff --git a/exp-bbv/tests/filter_stderr b/exp-bbv/tests/filter_stderr
new file mode 100644
index 0000000..15b6f6e
--- /dev/null
+++ b/exp-bbv/tests/filter_stderr
@@ -0,0 +1,13 @@
+#! /bin/sh
+
+dir=`dirname $0`
+
+# Run the common stderr filter first, then keep only the lines that
+# begin with '#', using a single sed invocation:
+#   1st expression: drop non-empty lines whose first character is not '#'
+#   2nd expression: drop empty lines
+$dir/../../tests/filter_stderr_basic  |
+sed -e '/^[^#]/d' \
+    -e '/^$/d'
+
+
diff --git a/exp-bbv/tests/logo.include b/exp-bbv/tests/logo.include
new file mode 100644
index 0000000..e5aac17
--- /dev/null
+++ b/exp-bbv/tests/logo.include
@@ -0,0 +1,6 @@
+.equ FREQUENT_CHAR,0
+.equ N,1024
+.equ F,64
+.equ THRESHOLD,2
+.equ P_BITS,10
+.equ POSITION_MASK,3
diff --git a/exp-bbv/tests/logo.lzss_new b/exp-bbv/tests/logo.lzss_new
new file mode 100644
index 0000000..626bf0e
--- /dev/null
+++ b/exp-bbv/tests/logo.lzss_new
@@ -0,0 +1,21 @@
+logo:
+	.byte	255,27,91,48,59,49,59,51,55
+	.byte	159,59,52,55,109,35,204,247,192,7,51
+	.byte	141,48,200,27,27,91,196,7,203,31,28,12,59
+	.byte	15,52,48,109,10,192,247,1,96,26,56,44,156
+	.byte	31,27,91,51,49,109,204,4,65,172,13,36
+	.byte	2,28,16,79,13,32,16,65,147,152,131,52,28,52,204,16
+	.byte	16,12,36,111,57,236,167,28,8,51,22,20,137,85,44,96
+	.byte	0,43,97,214,113,226,200,203,8,212,9,211,16,43,89,245,209
+	.byte	0,128,17,210,24,13,40,28,20,13,44,28,28,240,74,26,91
+	.byte	0,13,80,95,101,135,101,43,85,245,205,205,40,205,20,137,65
+	.byte	0,29,135,66,75,114,83,28,120,15,98,135,109,85,88,247,193
+	.byte	0,232,43,244,151,73,120,61,176,27,95,151,176,18,43,171,202
+	.byte	16,223,22,26,245,90,245,217,63,51,27,86,146,91,176,2
+	.byte	0,12,29,211,200,172,57,23,102,50,246,110,109,236,68,96,94
+	.byte	8,175,10,166,105,20,1,48,51,11,222,31,49,15,211,188
+	.byte	0,175,79,25,86,170,69,82,219,40,82,70,127,8,83,219,35
+	.byte	0,169,85,170,53,24,33,18,104,145,42,200,34,178,104,112,45
+	.byte	0,198,80,178,121,145,74,112,49,248,81,243,40,221,23,255,23
+	.byte	8,2,54,3,36,229,66,10
+logo_end:
diff --git a/exp-bbv/tests/ppc32-linux/Makefile.am b/exp-bbv/tests/ppc32-linux/Makefile.am
new file mode 100644
index 0000000..d022cf7
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/Makefile.am
@@ -0,0 +1,22 @@
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+check_PROGRAMS = \
+	million ll
+
+EXTRA_DIST = \
+	   ll.stderr.exp \
+	   ll.stdout.exp \
+	   ll.post.exp \
+	   ll.vgtest \
+	   million.stderr.exp \
+	   million.post.exp \
+	   million.vgtest
+
+AM_CCASFLAGS += -ffreestanding
+
+LDFLAGS += -nostartfiles -nodefaultlibs
+
+ll_SOURCES = ll.S
+million_SOURCES = million.S
diff --git a/exp-bbv/tests/ppc32-linux/filter_stderr b/exp-bbv/tests/ppc32-linux/filter_stderr
new file mode 100644
index 0000000..1c07666
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/filter_stderr
@@ -0,0 +1,5 @@
+#! /bin/sh
+
+# Locate the shared filter relative to this script, not the cwd.
+dir=`dirname $0`
+$dir/../filter_stderr
diff --git a/exp-bbv/tests/ppc32-linux/ll.S b/exp-bbv/tests/ppc32-linux/ll.S
new file mode 100644
index 0000000..7621b95
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/ll.S
@@ -0,0 +1,579 @@
+#
+#  linux_logo in ppc assembly language
+#    based on the code from ll_asm-0.36
+#
+#  By Vince Weaver <vince _at_ deater.net>
+#
+# Modified to remove non-deterministic system calls
+# And to avoid reading from /proc
+#
+
+# offsets into the results returned by the uname syscall
+.equ U_SYSNAME,0
+.equ U_NODENAME,65
+.equ U_RELEASE,65*2
+.equ U_VERSION,(65*3)
+.equ U_MACHINE,(65*4)
+.equ U_DOMAINNAME,65*5
+
+# offset into the SYSCALL_SYSINFO buffer
+.equ S_TOTALRAM,16
+
+# Syscalls
+.equ SYSCALL_EXIT,     1
+#.equ SYSCALL_READ,     3
+.equ SYSCALL_WRITE,    4
+#.equ SYSCALL_OPEN,     5
+#.equ SYSCALL_CLOSE,    6
+#.equ SYSCALL_SYSINFO,116
+#.equ SYSCALL_UNAME,  122
+
+#
+.equ STDIN, 0
+.equ STDOUT,1
+.equ STDERR,2
+
+.equ BSS_BEGIN,25
+.equ DATA_BEGIN,26
+
+.include "../logo.include"
+
+	.globl _start	
+_start:	
+
+        #========================
+	# Initialization
+	#========================
+	
+
+#	eieio				# coolest opcode of all time ;)
+					# not needed, but I had to put it here
+  	# the hack loading BSS_BEGIN and DATA_BEGIN
+	# saves one instruction on any future load from memory
+	# as we can just do an addi rather than an lis;addi
+
+	lis	25,bss_begin@ha
+	addi	25,25,bss_begin@l
+	
+	lis	26,data_begin@ha
+	addi	26,26,data_begin@l
+
+	addi	14,BSS_BEGIN,(out_buffer-bss_begin)
+					# the output buffer
+
+	addi	21,BSS_BEGIN,(text_buf-bss_begin)
+ 	     	
+
+	mr	17,14		    	# store out-buffer for later
+
+        #=========================
+	# PRINT LOGO
+	#=========================
+
+# LZSS decompression algorithm implementation
+# by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
+# optimized some more by Vince Weaver
+
+
+	li	8,(N-F)			# grab "R"
+
+	addi	9,DATA_BEGIN,(logo-data_begin)-1
+					# logo_pointer
+
+	addi	12,DATA_BEGIN,(logo_end-data_begin)-1
+					# end of the logo
+
+
+	mr      16,17
+
+decompression_loop:
+	lbzu 	10,1(9)			# load in a byte
+					# auto-update
+	mr	11,10			# copy to 11
+	ori	11,11,0xff00		# re-load top as a hackish 
+					# 8-bit counter
+
+test_flags:
+	cmpw	0,12,9			# have we reached the end?
+	ble	done_logo		# ! if so exit
+
+	andi.	13,11,0x1
+	srawi   11,11,1
+	
+	bne	0,discrete_char
+
+offset_length:
+	lbzu  	10,1(9)
+	lbzu	24,1(9)
+	slwi	24,24,8
+	or	24,24,10
+	
+	mr	10,24
+
+	srawi  15,10,P_BITS
+	addi   15,15,THRESHOLD+1 # cl = ax >> (P_BITS)+THRESH+1
+	       			 # = match length
+	       			 
+output_loop:
+	andi.  24,24,(POSITION_MASK<<8+0xff)	# mask it: gas binds << tighter than +, so this is 0x3ff
+	lbzx   10,21,24				# load byte from text_buf (r21) at position r24
+	addi   24,24,1				# advance the match position
+	
+store_byte:
+	stbu   10,1(16)
+	
+	stbx    10,21,8
+	addi	8,8,1
+	andi.	8,8,(N-1)
+
+	addic.	15,15,-1
+	bne	0,output_loop
+	
+	andi.	13,11,0xff00
+	bne	test_flags
+	
+	b	decompression_loop
+
+discrete_char:
+
+	lbzu    10,1(9)
+	li	15,1
+
+	b       store_byte
+
+done_logo:
+
+	addi	4,17,1		# restore (plus one because r17 is decremented)
+	bl	write_stdout	# and print the logo
+	
+
+        #==========================
+	# First Line
+	#==========================
+
+	
+	#==========================
+	# PRINT VERSION
+	#==========================
+	
+#	li	0,SYSCALL_UNAME		# uname syscall
+#	addi	3,BSS_BEGIN,(uname_info-bss_begin)		
+					# uname struct
+#	sc				# do syscall
+
+
+	addi	16,DATA_BEGIN,(uname_info-data_begin)+U_SYSNAME@l-1	
+					# os-name from uname "Linux"
+	bl	strcat
+	
+	addi	16,DATA_BEGIN,(ver_string-data_begin)-1
+					# source is " Version "
+	bl 	strcat
+	
+	addi	16,DATA_BEGIN,(uname_info-data_begin)+U_RELEASE@l-1
+					# version from uname "2.4.1"
+	bl 	strcat
+	
+	addi	16,DATA_BEGIN,(compiled_string-data_begin)-1
+					# source is ", Compiled "
+	bl 	strcat
+
+	addi	16,DATA_BEGIN,(uname_info-data_begin)+U_VERSION-1
+      					# compiled date
+	bl 	strcat
+	
+	bl	center_and_print	# write it to screen
+	
+
+	#===============================
+	# Middle-Line
+	#===============================
+	
+	#=========
+	# Load /proc/cpuinfo into buffer
+	#=========
+
+#	li	0,SYSCALL_OPEN		# open()
+#	addi	3,DATA_BEGIN,(cpuinfo-data_begin)		
+					# '/proc/cpuinfo'
+#	li	4,0			# O_RDONLY <bits/fcntl.h>
+#	sc				# syscall.  fd in r0.  
+					# we should check that r0>=0
+					
+#	mr	13,3			# save fd in r13
+	
+#	li	0,SYSCALL_READ		# read
+#	addi	4,BSS_BEGIN,(disk_buffer-bss_begin)
+#	li	5,4096		 	# 4096 is maximum size of proc file ;)
+#	sc	
+
+#	mr	3,13			# restore fd
+#	li	0,6			# close
+#	sc
+
+	#=============
+	# Number of CPUs
+	#=============
+	
+	mr	14,17 			# point output to out_buf
+
+	# Assume 1 CPU for now
+	# my iBook's /proc/cpuinfo does not have a "processor" line ???
+	
+	addi	16,DATA_BEGIN,(one-data_begin)-1
+	bl	strcat
+	
+	#=========
+	# MHz
+	#=========
+	
+    	lis	20,('l'<<8)+'o'		# find 'lock ' and grab up to M
+	addi	20,20,('c'<<8)+'k'
+	li	23,'M'			
+   	bl	find_string
+   
+	addi	16,DATA_BEGIN,(megahertz-data_begin)-1
+					# print 'MHz '
+	bl	strcat
+   
+  
+	#=========
+	# Chip Name
+	#=========
+	
+   	lis     20,('c'<<8)+'p'     	# find 'cpu\t: ' and grab up to \n
+	addi	20,20,('u'<<8)+'\t'
+	li	23,'\n'
+	bl	find_string
+	
+	addi	16,DATA_BEGIN,(comma-data_begin)-1
+					# print ', '
+	bl	strcat
+	
+	#========
+	# RAM
+	#========
+	
+#	li	0,SYSCALL_SYSINFO	# sysinfo() syscall
+#	addi	3,BSS_BEGIN,(sysinfo_buff-bss_begin)
+					# sysinfo_buffer
+
+#	sc
+
+	lwz	4,(sysinfo_buff+S_TOTALRAM-data_begin)(DATA_BEGIN)
+					# load bytes of RAM into r4
+
+	srawi	4,4,20		# divide by 2^20 to get MB
+	li	5,0
+
+	bl	num_to_ascii
+
+	addi	16,DATA_BEGIN,(ram_comma-data_begin)-1
+					# print 'M RAM, '
+
+	bl	strcat
+	
+	#========
+	# Bogomips
+	#========
+	
+	lis	20,('m'<<8)+'i'		# find 'mips' and grab up to \n
+	addi	20,20,('p'<<8)+'s'
+	li	23,'\n'
+	bl	find_string
+      
+	addi	16,DATA_BEGIN,(bogo_total-data_begin)-1
+					# print "Bogomips Total"
+	bl	strcat
+
+	bl	center_and_print	# center it
+
+
+	#=================================
+	# Print Host Name
+	#=================================
+	
+	mr	14,17			# restore out buffer
+	
+	addi	16,DATA_BEGIN,((uname_info-data_begin)+U_NODENAME)-1
+					# hostname		      
+					
+	bl	strcat				
+	
+	bl	center_and_print
+
+	#================================
+	# Exit
+	#================================
+exit:	
+        li      3,0		# 0 exit value
+	li      0,SYSCALL_EXIT  # put the exit syscall number in r0
+	sc	             	# and exit
+
+
+
+
+	#=================================
+	# FIND_STRING 
+	#=================================
+	#   r23 is char to end at
+	#   r20 is the 4-char ascii string to look for
+	#   r14 points at output buffer
+	#   r13,r16,r21 trashed
+
+find_string:
+		
+	addi	16,DATA_BEGIN,(disk_buffer-data_begin)-1	
+					# look in cpuinfo buffer
+					# -1 so we can use lwzu/lbzu (pre-increment)
+	
+find_loop:
+	lwzu	13,1(16)		# load in 32 bits, incrementing 8bits
+	cmpwi	13,0			# ! if all 4 bytes are null, we are done
+	beq	done
+	cmpw	13,20			# compare with our 4 char string
+	bne	find_loop		# ! if no match, keep looping
+
+	
+					# ! if we get this far, we matched
+					
+	li	21,':'			# r21 = delimiter to scan for
+find_colon:
+	lbzu	13,1(16)		# repeat till we find colon
+	cmpwi	13,0
+	beq	done
+	cmpw	13,21
+	bne	find_colon
+
+	addi	16,16,1			# skip a char [should be space]
+	
+store_loop:	 
+	 lbzu	13,1(16)
+	 cmpwi	13,0
+	 beq	done
+    	 cmpw	13,23			# is it end string?
+	 beq 	almost_done		# ! if so, finish
+	 stbu	13,1(14)		# ! if not store and continue
+	 b	store_loop
+	 
+almost_done:	 
+	li	13,0			# replace last value with null
+	stb	13,1(14)		# no update: r14 stays on the last char stored
+
+done:
+	blr
+
+	#================================
+	# strcat
+	#================================
+	# r13 = "temp"
+	# r16 = "source"      (one byte before the string; lbzu pre-increments)
+       	# r14 = "destination" (one byte before the write position; stbu pre-increments)
+strcat:
+	lbzu	13,1(16)		# load a byte from [r16+1], updating r16
+	stbu	13,1(14)		# store a byte to [r14+1], updating r14
+	cmpwi	13,0			# is it zero?
+	bne	strcat			# ! if not loop
+	subi	14,14,1			# point to one less than null
+	blr				# return
+
+	#==============================
+	# center_and_print
+	#==============================
+	# r14 is end of buffer
+	# r17 is start of buffer
+	# r29 = saved link register
+	# r4-r10, r19-r22, r30 trashed (also r0, r3, r16, r23)
+	
+center_and_print:
+
+	mflr 	29			# back up return address
+
+	subf	5,17,14			# see how long the output
+					# buffer is
+					
+	cmpwi	5,80			# see if we are >80
+        bgt	done_center		# ! if so, bail
+
+	li	4,80			# 80 column screen
+	subf	4,5,4			# subtract strlen
+	srawi	23,4,1			# divide by two
+
+	lis	4,escape@ha		# emit "\033[" <r23> "C" before the text
+	addi	4,4,escape@l
+	bl	write_stdout
+
+	mr	4,23			# r4 = the column count computed above
+	li	5,1			# print to stdout
+	bl	num_to_ascii		# print number
+	
+	lis	4,c@ha
+	addi	4,4,c@l
+	bl	write_stdout
+
+
+done_center:	
+
+	addi	4,17,1			# move string to output+1
+	bl	write_stdout		# call write stdout
+
+	lis	4,linefeed@ha
+	addi	4,4,linefeed@l
+
+	mtlr	29	      		# restore link register
+					# and fall through so write_stdout
+					# prints the linefeed and returns for us
+
+
+
+	#================================
+	# WRITE_STDOUT
+	#================================
+	# r4 has string (null-terminated; its length is counted here)
+	# r0,r3,r4,r5,r6 trashed
+		
+write_stdout:
+	li	0,SYSCALL_WRITE		# write syscall
+	li	3,STDOUT		# stdout	
+	
+	li	5,0			# string length counter
+strlen_loop:
+	lbzx 	6,4,5			# get byte from (r4+r5)
+       	addi	5,5,1			# increment counter
+	cmpi	0,6,0			# is it zero?
+	bne	strlen_loop		# ! if not keep counting
+	addi	5,5,-1			# undo the count of the terminating null
+	sc				# syscall: r3 = fd, r4 = buffer, r5 = length
+	
+	blr				# return
+
+
+	##############################
+	# Num to Ascii
+	##############################
+	# num is in r4
+	# r5 =0 then strcat, otherwise stdout
+	# r5-r10,r16,r19,r20,r21,r22,r30 trashed	
+
+num_to_ascii:
+
+	mflr    30			# save the link register
+
+	addi	16,BSS_BEGIN,(num_to_ascii_end-bss_begin)
+					# the end of a backwards growing
+					# 10 byte long buffer.  
+					
+	li	20,10			# we will divide by 10
+	mr	19,4			# load in the value passed
+	
+div_by_10:
+	divw	21,19,20		# divide r19 by r20 put into r21 
+	
+	mullw	22,21,20		# find remainder.  1st q*divisor
+	subf	22,22,19		# then subtract from original = R
+	addi	22,22,0x30		# convert remainder to ascii
+    	
+	stbu	22,-1(16)		# Store to backwards buffer
+	
+	mr	19,21			# move Quotient as new dividend
+	cmpwi	19,0			# was quotient zero?
+	bne    	div_by_10		# ! if not keep dividing
+	
+write_out:
+	cmpwi	5,0			# ! if r5 is 0 fall through to strcat
+	bne 	stdout_num		# ! otherwise write to stdout
+
+	addi	16,16,-1		# point one before the digits (strcat pre-increments)
+	bl	strcat			# and strcat it
+
+	mtlr	30			# restore link register
+
+	blr				# return
+	
+stdout_num:
+        mr	4,16			# point to our buffer
+	mtlr	30			# restore link register
+	b	write_stdout		# stdout will return for us
+
+
+#===========================================================================
+.data
+#===========================================================================
+
+
+data_begin:
+
+.include "../logo.lzss_new"
+
+ver_string:	.ascii	" Version \0"
+compiled_string:	.ascii	", Compiled \0"
+megahertz:	.ascii	"MHz PPC \0"
+.equ space, ram_comma+6
+.equ comma, ram_comma+5
+linefeed:   	.ascii  "\n\0"
+escape:		.ascii	"\033[\0"
+c:		.ascii  "C\0"
+ram_comma:	.ascii	"M RAM, \0"
+
+bogo_total:	.ascii	" Bogomips Total\0"
+
+default_colors:	.ascii	"\033[0m\n\n\0"
+
+cpuinfo:	.ascii	"/proc/cpuinfo\0"
+
+one:	.ascii	"One \0"
+
+disk_buffer:
+.ascii "processor	: 0\n"
+.ascii "cpu		: 745/755\n"
+.ascii "temperature 	: 22-24 C (uncalibrated)\n"
+.ascii "clock		: 600.000000MHz\n"
+.ascii "revision	: 51.17 (pvr 0008 3311)\n"
+.ascii "bogomips	: 49.79\n"
+.ascii "timebase	: 24960000\n"
+.ascii "platform	: PowerMac\n"
+.ascii "model		: PowerBook4,1\n"
+.ascii "machine		: PowerBook4,1\n"
+.ascii "motherboard	: PowerBook4,1 MacRISC2 MacRISC Power Macintosh\n"
+.ascii "detected as	: 257 (iBook 2)\n"
+.ascii "pmac flags	: 0000001b\n"
+.ascii "L2 cache	: 256K unified\n"
+.ascii "pmac-generation	: NewWorld\n\0"
+
+uname_info:
+.ascii "Linux\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "henparma\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "2.6.29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "#1 Wed May 13 15:51:54 UTC 2009\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+	
+sysinfo_buff:
+.long 0,0,0,0,512*1024*1024,0,0,0
+
+#============================================================================
+#.bss
+#============================================================================
+
+.lcomm bss_begin,0
+.lcomm	num_to_ascii_buff,10
+.lcomm num_to_ascii_end,1
+.lcomm  text_buf, (N+F-1)	# These buffers must follow each other
+.lcomm	out_buffer,16384
+
+
+
+
+
+
+
+
+
+
diff --git a/exp-bbv/tests/ppc32-linux/ll.post.exp b/exp-bbv/tests/ppc32-linux/ll.post.exp
new file mode 100644
index 0000000..28c9d41
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/ll.post.exp
@@ -0,0 +1,49 @@
+T:1:16   :8:10   :6:32   :2:56   :9:48   :10:666   :4:90   :5:36   :7:2   :3:45   
+T:8:5   :6:20   :2:34   :9:80   :10:775   :4:42   :5:22   :7:1   :3:21   
+T:8:5   :6:16   :2:27   :9:64   :10:824   :4:30   :5:18   :7:1   :3:15   
+T:8:5   :6:10   :2:18   :9:80   :10:865   :4:6   :5:12   :7:1   :3:3   
+T:8:5   :6:10   :2:18   :9:96   :10:858   :5:12   :7:1   
+T:8:5   :6:10   :2:18   :9:80   :10:865   :4:6   :5:12   :7:1   :3:3   
+T:6:6   :2:9   :9:36   :10:943   :5:6   
+T:8:5   :6:8   :2:15   :9:92   :10:869   :5:10   :7:1   
+T:6:14   :2:21   :9:112   :10:839   :5:14   
+T:8:5   :6:6   :2:12   :9:64   :10:902   :5:10   :7:1   
+T:8:5   :6:8   :2:15   :9:80   :10:883   :5:8   :7:1   
+T:6:8   :2:12   :9:64   :10:908   :5:8   
+T:6:6   :2:9   :9:48   :10:931   :5:6   
+T:8:5   :6:4   :2:9   :9:48   :10:927   :5:6   :7:1   
+T:6:6   :2:9   :9:48   :10:931   :5:6   
+T:8:5   :6:6   :2:12   :9:64   :10:904   :5:8   :7:1   
+T:6:2   :2:3   :9:16   :10:977   :5:2   
+T:8:5   :6:12   :2:21   :9:96   :10:842   :4:6   :5:14   :7:1   :3:3   
+T:6:6   :2:9   :9:48   :10:931   :5:6   
+T:6:6   :2:9   :9:48   :10:931   :5:6   
+T:8:5   :6:14   :2:24   :9:112   :10:819   :4:6   :5:16   :7:1   :3:3   
+T:8:5   :6:6   :2:12   :9:64   :10:904   :5:8   :7:1   
+T:6:6   :2:9   :9:48   :10:931   :5:6   
+T:8:5   :6:8   :2:15   :9:80   :10:881   :5:10   :7:1   
+T:8:5   :6:10   :2:18   :9:96   :10:858   :5:12   :7:1   
+T:6:10   :2:15   :9:80   :10:885   :5:10   
+T:8:5   :6:10   :2:15   :9:64   :10:470   :4:6   :5:12   :7:1   :3:3   :11:2   :12:7   :13:405   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:13:1000   
+T:16:2   :19:2   :20:2   :21:2   :22:2   :23:1   :17:268   :18:10   :24:4   :25:6   :26:3   :30:3   :31:2   :12:28   :13:636   :14:8   :15:4   :27:12   :28:2   :29:3   
+T:33:3   :34:4   :46:2   :47:4   :48:2   :49:4   :53:2   :54:4   :35:8   :37:246   :36:168   :38:8   :40:27   :39:22   :41:8   :44:51   :42:38   :43:34   :45:6   :17:116   :18:10   :32:10   :13:183   :14:4   :15:2   :27:12   :50:16   :28:2   :51:2   :52:2   
+T:55:2   :56:1   :35:4   :37:381   :36:256   :38:4   :40:12   :39:10   :41:4   :44:15   :42:12   :43:10   :45:3   :17:64   :18:2   :24:4   :25:6   :26:3   :30:3   :31:2   :12:28   :13:148   :14:6   :15:3   :27:12   :28:2   :29:3   
+
+
+# Thread 1
+#   Total intervals: 40 (Interval Size 1000)
+#   Total instructions: 40330
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
diff --git a/exp-bbv/tests/ppc32-linux/ll.stderr.exp b/exp-bbv/tests/ppc32-linux/ll.stderr.exp
new file mode 100644
index 0000000..60e953f
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/ll.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 40 (Interval Size 1000)
+#   Total instructions: 40330
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/ppc32-linux/ll.stdout.exp b/exp-bbv/tests/ppc32-linux/ll.stdout.exp
new file mode 100644
index 0000000..b296561
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/ll.stdout.exp
@@ -0,0 +1,16 @@
+###############################################################################
+###############################################################################
+##################################################################O#O##########
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+
+Linux Version 2.6.29, Compiled #1 Wed May 13 15:51:54 UTC 2009
+One 600.000000MHz PPC 745/755, 512M RAM, 49.79 Bogomips Total
+henparma
diff --git a/exp-bbv/tests/ppc32-linux/ll.vgtest b/exp-bbv/tests/ppc32-linux/ll.vgtest
new file mode 100644
index 0000000..6031a58
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/ll.vgtest
@@ -0,0 +1,5 @@
+prog: ll
+vgopts: --interval-size=1000 --bb-out-file=ll.out.bb
+post:	cat ll.out.bb
+cleanup: rm ll.out.bb
+
diff --git a/exp-bbv/tests/ppc32-linux/million.S b/exp-bbv/tests/ppc32-linux/million.S
new file mode 100644
index 0000000..e334e86
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/million.S
@@ -0,0 +1,23 @@
+
+     	     # count for 1 million instructions
+	     #   total is 3 + 499997*2 + 3
+	     
+	.globl _start	
+_start:	
+	nop				# to give us an even million
+	lis	15,499997@ha		# load high 16-bits of counter
+	addi	15,15,499997@l		# load low 16-bits of counter
+test_loop:	
+	addic.	15,15,-1		# decrement counter		
+	bne	0,test_loop		# loop until zero
+
+	#================================
+	# Exit
+	#================================
+
+exit:
+        li      3,0             # 0 exit value
+	li      0,1             # put the exit syscall number (1) in r0
+	sc                      # and exit
+			
+
diff --git a/exp-bbv/tests/ppc32-linux/million.post.exp b/exp-bbv/tests/ppc32-linux/million.post.exp
new file mode 100644
index 0000000..260eee6
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/million.post.exp
@@ -0,0 +1,18 @@
+T:1:5   :2:99996   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+
+
+# Thread 1
+#   Total intervals: 10 (Interval Size 100000)
+#   Total instructions: 1000000
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
diff --git a/exp-bbv/tests/ppc32-linux/million.stderr.exp b/exp-bbv/tests/ppc32-linux/million.stderr.exp
new file mode 100644
index 0000000..adeb35d
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/million.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 10 (Interval Size 100000)
+#   Total instructions: 1000000
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/ppc32-linux/million.vgtest b/exp-bbv/tests/ppc32-linux/million.vgtest
new file mode 100644
index 0000000..c366a8b
--- /dev/null
+++ b/exp-bbv/tests/ppc32-linux/million.vgtest
@@ -0,0 +1,5 @@
+prog: million 
+vgopts: --interval-size=100000 --bb-out-file=million.out.bb
+post:	cat million.out.bb
+cleanup: rm million.out.bb
+
diff --git a/exp-bbv/tests/x86-linux/Makefile.am b/exp-bbv/tests/x86-linux/Makefile.am
new file mode 100644
index 0000000..f2971e1
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/Makefile.am
@@ -0,0 +1,24 @@
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+check_PROGRAMS = \
+	ll clone_test
+
+EXTRA_DIST = \
+	   clone_test.stderr.exp \
+	   clone_test.post.exp \
+	   clone_test.vgtest \
+	   ll.stderr.exp \
+	   ll.stdout.exp \
+	   ll.post.exp \
+	   ll.vgtest
+
+AM_CCASFLAGS += -ffreestanding
+
+LDFLAGS += @FLAG_M32@ -static -nostartfiles -nodefaultlibs
+
+clone_test_SOURCES = clone_test.S
+ll_SOURCES = ll.S
+
+AM_CCASFLAGS += @FLAG_M32@
diff --git a/exp-bbv/tests/x86-linux/clone_test.S b/exp-bbv/tests/x86-linux/clone_test.S
new file mode 100644
index 0000000..c96204a
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/clone_test.S
@@ -0,0 +1,95 @@
+     	     # count for ~1 million instructions thread 1
+	     # count for ~2 million instructions thread 2
+	     # count for additional ~500 thousand each before exit
+	     
+	.globl _start	
+_start:	
+
+	#################################################
+        # 1000 instructions in initial thread           #
+	#################################################
+	
+	xor	%eax,%eax
+	mov	$499,%ecx		# load counter
+initial_loop:	
+	dec	%ecx			# repeat count times
+	jnz	initial_loop
+
+
+	#####################################################
+	# Spawn a thread!                                   #
+	#####################################################
+clone:
+	mov    $120,%eax		# clone syscall
+	
+	# Note, clone syscall is different than the glibc implementation
+	
+# 	int clone (flags, stack_pointer,parent_tidptr,child_tidptr,tls)
+
+
+	       				# Flags in 
+	       				#/usr/include/bits/sched.h
+					# CLONE_THREAD 0x10000
+					# CLONE_SIGHAND 0x800
+					# CLONE_VM      0x100
+					# above must be called together
+					# Below required for Valgrind
+					# CLONE_FS	 0x200
+					# CLONE_FILES	 0x400
+
+	mov    $0x10f00,%ebx		# all five flags above or'ed together
+	
+
+	mov    $(new_stack+4096),%ecx	 	 	# new stack (end of the 4096-byte area)
+
+	
+
+	mov    $0,%edx		# third argument (parent_tidptr) = 0
+
+	int    $0x80
+	
+	cmp   $0,%eax		# are we in new thread?
+	jz    thread2		# if so, jump to thread2
+
+
+	###############################################
+	# thread1                                     #
+	###############################################
+
+thread1:
+
+	mov	$499997,%ecx		# load counter
+thread1_loop:	
+	dec	%ecx			# repeat count times
+	jnz	thread1_loop
+
+	xor     %ebx,%ebx		# we return 0
+	jmp    exit
+	
+thread2:	
+	mov	$999997,%ecx		# load counter
+thread2_loop:	
+	dec	%ecx			# repeat count times
+	jnz	thread2_loop	
+	
+	mov    $5,%ebx			# we return 5
+	
+	
+	#================================
+	# Exit
+	#================================
+exit:
+
+     	# count an additional ~500 thousand instructions
+
+	mov	$250000,%ecx		# load counter
+exit_loop:	
+	dec	%ecx			# repeat count times
+	jnz	exit_loop	
+
+actual_exit:
+	mov	$1,%eax		# put exit syscall number (1) in eax
+	int	$0x80		# exit status is whatever the thread left in ebx
+
+.bss
+.lcomm	new_stack,4096
diff --git a/exp-bbv/tests/x86-linux/clone_test.post.exp b/exp-bbv/tests/x86-linux/clone_test.post.exp
new file mode 100644
index 0000000..55bcf61
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/clone_test.post.exp
@@ -0,0 +1,58 @@
+T 4    996    5    2    3    98991   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 1001    2    3    98994   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+
+
+# Thread 1
+#   Total intervals: 15 (Interval Size 100000)
+#   Total instructions: 1501007
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
+T 2    3    99996   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 99996    4   
+T 100000   
+T 100000   
+T 100000   
+T 100000   
+T 99998    2   
+
+
+# Thread 2
+#   Total intervals: 25 (Interval Size 100000)
+#   Total instructions: 2500001
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
diff --git a/exp-bbv/tests/x86-linux/clone_test.stderr.exp b/exp-bbv/tests/x86-linux/clone_test.stderr.exp
new file mode 100644
index 0000000..6a917a2
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/clone_test.stderr.exp
@@ -0,0 +1,12 @@
+# Thread 1
+#   Total intervals: 15 (Interval Size 100000)
+#   Total instructions: 1501007
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+# Thread 2
+#   Total intervals: 25 (Interval Size 100000)
+#   Total instructions: 2500001
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/x86-linux/clone_test.vgtest b/exp-bbv/tests/x86-linux/clone_test.vgtest
new file mode 100644
index 0000000..9f5cd4d
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/clone_test.vgtest
@@ -0,0 +1,5 @@
+prog: clone_test
+vgopts: --interval-size=100000 --bb-out-file=clone_test.out.bb --pc-out-file=clone_test.out.pc
+post:	cat clone_test.out.bb clone_test.out.bb.2 | ../filter_bb
+cleanup: rm -f clone_test.out.bb clone_test.out.bb.2 clone_test.out.pc
+
diff --git a/exp-bbv/tests/x86-linux/filter_stderr b/exp-bbv/tests/x86-linux/filter_stderr
new file mode 100644
index 0000000..1c07666
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/filter_stderr
@@ -0,0 +1,5 @@
+#! /bin/sh
+
+# Locate the shared filter relative to this script, not the cwd.
+dir=`dirname $0`
+$dir/../filter_stderr
diff --git a/exp-bbv/tests/x86-linux/ll.S b/exp-bbv/tests/x86-linux/ll.S
new file mode 100644
index 0000000..8958521
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/ll.S
@@ -0,0 +1,608 @@
+#
+#  linux_logo in i386 assembly language
+#    based on the code from ll_asm-0.36
+#
+#  By Vince Weaver <vince _at_ deater.net>
+#
+# Modified to remove non-deterministic system calls
+# And to avoid reading from /proc
+#
+
+.include "../logo.include"
+
+# offsets into the results returned by the uname syscall
+.equ U_SYSNAME,0
+.equ U_NODENAME,65
+.equ U_RELEASE,65*2
+.equ U_VERSION,(65*3)
+.equ U_MACHINE,(65*4)
+.equ U_DOMAINNAME,65*5
+
+# offset into the results returned by the sysinfo syscall
+.equ S_TOTALRAM,16
+
+# Syscall numbers
+.equ SYSCALL_EXIT,     1
+.equ SYSCALL_WRITE,    4
+
+#
+.equ STDIN,0
+.equ STDOUT,1
+.equ STDERR,2
+
+	.globl _start	
+_start:	
+	#=========================
+	# PRINT LOGO
+	#=========================
+
+# LZSS decompression algorithm implementation
+# by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
+# optimized some more by Vince Weaver
+
+	# we used to fill the buffer with FREQUENT_CHAR
+	# but, that only gains us one byte of space in the lzss image.
+	# the lzss algorithm does automatic RLE... pretty clever
+	# so we compress with NUL as FREQUENT_CHAR and it is pre-done for us
+
+	mov     $(N-F), %bp   	     	# bp = r, the write index into text_buf
+
+	mov  	$logo, %esi		# %esi points to logo (for lodsb)
+
+	mov	$out_buffer, %edi	# point to out_buffer
+	push	%edi	     		# save this value for later (popped at done_logo)
+
+decompression_loop:	
+	lodsb			# load in a byte (the next group of 8 flag bits)
+
+	mov 	$0xff, %bh	# re-load top as a hackish 8-bit counter
+	mov 	%al, %bl	# move in the flags
+
+test_flags:
+	cmp	$logo_end, %esi # have we reached the end?
+	je	done_logo  	# if so, exit
+
+	shr 	$1, %ebx	# shift bottom bit into carry flag
+	jc	discrete_char	# if set, we jump to discrete char
+
+offset_length:
+	lodsw                   # get match_length and match_position
+	mov %eax,%edx		# copy to edx
+	    			# no need to mask dx, as we do it
+				# by default in output_loop
+	
+	shr $(P_BITS),%eax	# drop the position bits, leaving the length field
+	add $(THRESHOLD+1),%al
+	mov %al,%cl             # cl = (ax >> P_BITS) + THRESHOLD + 1
+				#                       (=match_length)
+		
+output_loop:
+	and 	$POSITION_MASK,%dh  	# mask it
+	mov 	text_buf(%edx), %al	# load byte from text_buf[]
+	inc 	%edx	    		# advance pointer in text_buf
+store_byte:	
+	stosb				# store it
+	
+	mov     %al, text_buf(%ebp)	# store also to text_buf[r]
+	inc 	%ebp 			# r++
+	and 	$(N-1), %bp		# mask r
+
+	loop 	output_loop		# dec ecx; repeat until the whole run is copied
+	
+	or	%bh,%bh			# bh reaches 0 once all 8 flag bits are used
+	jnz	test_flags		# bits remain: test the next flag
+	
+	jmp 	decompression_loop	# none left: fetch a new flag byte
+
+discrete_char:
+	lodsb				# load a byte
+	inc	%ecx			# we set ecx to one so byte
+					# will be output once
+					# (ecx is 0 after any finished `loop`; the first pass presumably relies on ecx being 0 at entry - TODO confirm)
+					
+	jmp     store_byte              # and cleverly store it
+
+
+# end of LZSS code
+
+done_logo:
+
+	pop 	%ebp			# get out_buffer; ebp keeps it from here on
+	mov	%ebp,%ecx		# move out_buffer to ecx
+
+	call	write_stdout		# print the logo (NUL-terminated string at ecx)
+
+	#
+	#  Setup
+	#
+setup:
+	mov	$strcat,%edx		# use edx as call pointer
+
+	
+	#==========================
+	# PRINT VERSION
+	#==========================
+	
+#	push 	$SYSCALL_UNAME		# uname syscall
+#	pop	%eax			# in 3 bytes	
+#	mov	$uname_info,%ebx	# uname struct
+#	int	$0x80			# do syscall
+
+	mov	%ebp,%edi		# point %edi to out_buffer
+		
+	mov	$(uname_info+U_SYSNAME),%esi	# os-name from the canned uname data "Linux"
+	call	*%edx			# call strcat
+
+	mov	$ver_string,%esi		# source is " Version "
+	call 	*%edx			        # call strcat
+	push	%esi  				# save our .txt pointer (now at ", Compiled ")
+	
+	mov	$(uname_info+U_RELEASE),%esi    # release from the canned uname data "2.6.29"
+	call 	*%edx				# call strcat
+	
+	pop	%esi  			# restore .txt pointer
+					# source is ", Compiled "
+	call 	*%edx			# call strcat
+	push	%esi  			# esi is now at " Processor"; popped in chip_name
+
+	mov	$(uname_info+U_VERSION),%esi	# compiled date
+	call 	*%edx			# call strcat
+
+	mov	%ebp,%ecx		# move out_buffer to ecx
+
+	mov	$0xa,%ax		# store linefeed on end
+	stosw				# and zero
+
+	call	*%edx			# call strcat (esi is in the NUL padding after the version string, so only a terminator is copied)
+	
+	call	center_and_print	# center and print
+
+	#===============================
+	# Middle-Line
+	#===============================
+	
+	#=========
+	# Load /proc/cpuinfo into buffer
+	#=========
+
+	push	%edx			# save call pointer (restored in done_bogo)
+
+#	push	$SYSCALL_OPEN		# load 5 [ open() ]
+#	pop	%eax			# in 3 bytes
+	
+#	mov	$cpuinfo,%ebx		# '/proc/cpuinfo'
+#	xor	%ecx,%ecx		# 0 = O_RDONLY <bits/fcntl.h>
+#	cdq				# clear edx in clever way
+#	int	$0x80			# syscall.  fd in eax.  
+					# we should check that eax>=0
+					
+#	mov	%eax,%ebx		# save our fd
+	
+#	push	$SYSCALL_READ		# load 3 = read()
+#	pop	%eax			# in 3 bytes
+	
+	mov	$disk_buffer,%ecx	# ecx = canned cpuinfo text in .data; the open/read/close calls stay commented out
+
+#	mov	$16,%dh		 	# 4096 is maximum size of proc file #)
+					# we load sneakily by knowing
+					# 16<<8 = 4096. be sure edx clear
+
+
+#	int	$0x80
+
+#	push	$SYSCALL_CLOSE		# close (to be correct)
+#	pop	%eax
+#	int	$0x80			
+
+	#=============
+	# Number of CPUs
+	#=============
+number_of_cpus:
+
+	xor	%ebx,%ebx		# chip count (kept as 2 * number of matches)
+	
+					# $disk_buffer still in ecx
+bogo_loop:	
+	mov	(%ecx), %eax		# load 4 bytes into eax
+	inc	%ecx			# advance by one byte so every offset is tried
+	
+	cmp	$0,%al			# a NUL byte ends the cpuinfo text
+	je	done_bogo
+	
+	cmp	$('o'<<24+'g'<<16+'o'<<8+'b'),%eax	
+				        # "bogo" in little-endian
+					
+	jne	bogo_loop		# if not equal, keep going
+	
+	inc	%ebx			# otherwise, we have a bogo
+	inc	%ebx			# count by twos: done_bogo scales ebx by 3 to step 6-byte name slots
+	jmp	bogo_loop
+
+done_bogo:
+	lea	one-6(%ebx,%ebx,2), %esi	
+				    	# Load into esi
+					# [one]+((num_cpus-1)*6)
+					#
+					# the above multiplies by three
+					# esi = one - 6 + (ebx+(ebx*2))
+	 				# and we double-incremented ebx 
+					# earlier
+	 
+	mov	%ebp,%edi		# move output buffer to edi
+
+	pop	%edx			# restore call pointer
+	call	*%edx			# copy it (call strcat)
+
+	mov	$' ',%al		# print a space
+	stosb
+
+	push %ebx			# store cpu count (still doubled)
+	push %edx			# store strcat pointer
+
+	#=========
+	# MHz
+	#=========
+print_mhz:
+	mov	$('z'<<24+'H'<<16+'M'<<8+' '),%ebx	
+			   		# find ' MHz' and grab up to .
+	                                # we are little endian
+	mov	$'.',%ah		# ah = terminator character for find_string
+
+	# below is same as "sub $(strcat-find_string),%edx
+	# gas won't let us force the one-byte constant
+	.byte 0x83,0xEA,strcat-find_string   
+	
+	call	*%edx			# call find string (edx now points at find_string)
+
+	mov	%ebx,%eax  		# clever way to get MHz in, sadly
+	ror	$8,%eax			# not any smaller than a mov
+	stosl	    			# append the four bytes "MHz "
+
+	#=========
+	# Chip Name
+	#=========
+chip_name:	
+
+	# because of ugly newer cpuinfos from intel I had to hack this
+	# now we grab the first two words in the name field and use that
+	# it works on all recent Intel and AMD chips.  Older things
+	# might choke
+
+	mov	$('e'<<24+'m'<<16+'a'<<8+'n'),%ebx     	
+					# find 'name\t: ' and grab up to the first space
+					# we are little endian
+	mov	$' ',%ah
+	call	*%edx	   		# print first word (edx still points at find_string)
+	stosb				# store a space
+	call	skip_spaces		# print next word (enters find_string part-way, esi continues)
+
+	pop	%edx			# restore strcat pointer
+	pop	%ebx			# restore chip count
+	pop	%esi			# .txt pointer pushed in PRINT VERSION (" Processor")
+	
+	call	*%edx			# ' Processor'
+	cmpb	$2,%bl			# ebx holds 2 * cpu count, so 2 means one CPU
+	jne	print_s
+	inc	%esi   			# if singular, skip the s
+print_s:	
+	call	*%edx			# 's, '
+
+	push	%esi			# save the .txt pointer (next is 'M RAM, ')
+	push 	%edx			# save the strcat pointer; num_to_ascii clobbers edx
+	
+	#========
+	# RAM
+	#========
+	
+#	push    $SYSCALL_SYSINFO	# sysinfo() syscall
+#	pop	%eax	
+#	mov	$sysinfo_buff,%ebx	
+#	int	$0x80
+	
+	mov	(sysinfo_buff+S_TOTALRAM),%eax	# size in bytes of RAM
+	shr	$20,%eax		# divide by 1024*1024 to get M
+	adc	$0, %eax		# round up when the last bit shifted out was set
+
+
+	call num_to_ascii		# append eax in decimal at edi (clobbers eax, ebx, ecx, edx)
+	
+	pop  %edx	 		# restore strcat pointer
+	
+	pop     %esi	 		# print 'M RAM, '
+	call	*%edx			# call strcat
+
+	push	%esi			# save the .txt pointer (next is ' Bogomips Total')
+	
+
+	#========
+	# Bogomips
+	#========
+	
+	mov	$('s'<<24+'p'<<16+'i'<<8+'m'),%ebx      	
+					# find 'mips\t: ' and grab up to \n
+	mov	$0xa,%ah
+	call	find_string		# direct call; edx is left pointing at strcat
+
+	pop	%esi	   		# bogo total follows RAM 
+
+	call 	*%edx			# call strcat
+
+	push	%esi			# save the .txt pointer (default_colors), popped before the final write
+
+	mov	%ebp,%ecx		# point ecx to out_buffer
+
+
+	call	center_and_print	# center and print
+
+	#=================================
+	# Print Host Name
+	#=================================
+
+	mov     %ebp,%edi		  # point to output_buffer
+	
+	mov	$(uname_info+U_NODENAME),%esi	# host name from the canned uname data
+	call    *%edx			  # call strcat
+	
+		      			# ecx is unchanged (still out_buffer)
+	call	center_and_print	# center and print
+	
+	pop	%ecx			# (.txt) pointer to default_colors
+	
+	call	write_stdout		# emit the colour reset and trailing newlines
+	
+
+	#================================
+	# Exit
+	#================================
+exit:
+	xor     %ebx,%ebx		# exit status 0
+	xor	%eax,%eax
+	inc	%eax	 		# put exit syscall number (1) in eax
+	int     $0x80             	# and exit
+
+
+	#=================================
+	# FIND_STRING 
+	#=================================
+	#   ah is char to end at
+	#   ebx is 4-char ascii string to look for
+	#   edi points at output buffer
+
+find_string:
+					
+	mov	$disk_buffer-1,%esi	# look in cpuinfo buffer (-1 because the loop increments first)
+find_loop:
+	inc	%esi
+	cmpb	$0, (%esi)		# are we at EOF?
+	je	done			# if so, done
+
+	cmp	(%esi), %ebx		# do the strings match?
+	jne	find_loop		# if not, loop
+	
+					# ! if we get this far, we matched
+
+find_colon:	   			
+	lodsb				# repeat till we find colon
+	cmp	$0,%al			# this is actually smaller code
+	je	done			#   than an or ecx/repnz scasb
+	cmp	$':',%al
+	jne	find_colon
+
+
+skip_spaces:				# also called directly by chip_name to copy the next word
+        lodsb                           # skip spaces
+	cmp     $0x20,%al               # Loser new intel chips have lots??
+        je      skip_spaces
+
+store_loop:	 
+	cmp	$0,%al			# stop at end of buffer without terminating the output
+	je	done
+	cmp	%ah,%al			# is it end string?
+	je 	almost_done		# if so, finish
+	cmp	$'\n',%al		# also end if linefeed
+	je	almost_done
+	stosb				# if not store and continue
+	lodsb				# load value	
+	jmp	store_loop
+	 
+almost_done:	 
+
+	movb	 $0, (%edi)	        # NUL-terminate the output; edi is left pointing at the NUL
+done:
+	ret
+
+
+	#================================
+	# strcat
+	#================================
+
+strcat:					# in: esi = source, edi = destination; clobbers al
+	lodsb				# load a byte from [ds:esi]
+	stosb				# store a byte to [es:edi]
+	cmp	$0,%al			# is it zero?
+	jne	strcat			# if not loop
+	dec	%edi			# leave edi on the NUL so the next strcat appends
+	ret				# return with esi just past the source NUL
+
+	#==============================
+	# center_and_print
+	#==============================
+	# string to center in ecx
+
+center_and_print:
+	push    %edx			# preserve the caller's edx (popped just before write_stdout)
+	push	%ecx			# save the string pointer
+	inc	%edi			# move to a clear buffer
+	push	%edi			# save for later
+
+	mov	$('['<<8+27),%ax	# we want to output ^[[
+	stosw
+
+	cdq	      			# edx = 0 (sign-extends eax; assumes bit 31 of eax is clear - TODO confirm)
+	
+str_loop2:				# find end of string	
+	inc	%edx
+	cmpb	$0,(%ecx,%edx)		# repeat till we find zero
+	jne	str_loop2
+	
+	push	$81	 		# one added to cheat, we don't
+					# count the trailing '\n'
+	pop	%eax
+	
+	cmp	%eax,%edx		# is the length below 81?
+	jl	not_too_big		# if so, keep the real length
+	push	$80			# otherwise clamp to 80 so the offset becomes 0
+	pop	%edx
+	
+not_too_big:			
+	sub	%edx,%eax		# eax = 81 - length
+	
+	shr	%eax			# then divide by 2
+	
+	call	num_to_ascii		# print number of spaces
+	mov	$'C',%al		# tack a 'C' on the end
+					# ah is zero from num_to_ascii
+	stosw				# store C and a NULL
+	pop  %ecx			# pop the pointer to ^[[xC
+	
+	call write_stdout		# write to the screen
+	
+done_center:
+	pop  %ecx			# restore string pointer
+	     				# and trickily print the real string
+
+	pop %edx			# restore caller's edx, then fall through into write_stdout
+
+	#================================
+	# WRITE_STDOUT
+	#================================
+	# ecx has string
+	# eax,ebx trashed; edx is saved and restored around the syscall
+write_stdout:
+	push    %edx			# preserve caller's edx; it is used as the length below
+	push	$SYSCALL_WRITE		# put 4 in eax (write syscall)
+	pop     %eax     		# in 3 bytes of code
+	
+	cdq   	      			# clear edx (eax is 4, so the sign extension is 0)
+	
+	xor	%ebx,%ebx		# put 1 in ebx (stdout)
+	inc	%ebx			# in 3 bytes of code
+	
+			# another way of doing this:    lea 1(%edx), %ebx
+
+str_loop1:
+	inc	%edx			# pre-increment: byte 0 is never tested, so the string must be non-empty
+	cmpb	$0,(%ecx,%edx)		# repeat till zero
+	jne	str_loop1
+
+	int	$0x80  			# run the syscall (ebx = fd, ecx = buffer, edx = length)
+	pop	%edx
+	ret
+
+	##############################
+	# num_to_ascii
+	##############################
+	# eax = value to print (must be below 2^31: cdq is used to zero edx); eax,ebx,ecx,edx trashed
+	# edi points to where we want it
+	
+num_to_ascii:
+	push    $10
+	pop     %ebx            # ebx = 10, the divisor
+	xor     %ecx,%ecx       # clear ecx (digit counter)
+div_by_10:
+	cdq                     # clear edx (eax is non-negative here)
+	div     %ebx            # eax = quotient, edx = remainder (next digit)
+	push    %edx            # save for later
+	inc     %ecx            # add to length counter
+	or      %eax,%eax       # was Q zero?
+	jnz     div_by_10       # if not divide again
+	
+write_out:
+	pop     %eax            # restore in reverse order (most significant digit first)
+	add     $0x30, %al      # convert to ASCII
+	stosb                   # save digit
+	loop    write_out       # loop till done (ecx digits)
+	ret			# ah is 0 on return: the last popped value was a single digit
+
+#===========================================================================
+#	section .data
+#===========================================================================
+.data
+
+ver_string:	.ascii	" Version \0"
+compiled_string:	.ascii	", Compiled \0"
+processor:		.ascii " Processor\0"
+s_comma:		.ascii "s, \0"
+ram_comma:	.ascii	"M RAM, \0"
+bogo_total:	.ascii	" Bogomips Total\n\0"
+
+default_colors:	.ascii "\033[0m\n\n\0"
+
+cpuinfo:	.ascii	"/proc/cpuinfo\0"
+
+
+one:	.ascii	"One\0\0\0"
+two:	.ascii	"Two\0\0\0"
+three:	.ascii	"Three\0"
+four:	.ascii	"Four\0"
+
+.include	"../logo.lzss_new"
+
+disk_buffer:
+.ascii "processor	: 0\n"
+.ascii "vendor_id	: AuthenticAMD\n"
+.ascii "cpu family	: 6\n"
+.ascii "model		: 6\n"
+.ascii "model name	: AMD Athlon(tm) XP 2000+\n"
+.ascii "stepping	: 2\n"
+.ascii "cpu MHz		: 1665.267\n"
+.ascii "cache size	: 256 KB\n"
+.ascii "fdiv_bug	: no\n"
+.ascii "hlt_bug		: no\n"
+.ascii "f00f_bug	: no\n"
+.ascii "coma_bug	: no\n"
+.ascii "fpu		: yes\n"
+.ascii "fpu_exception	: yes\n"
+.ascii "cpuid level	: 1\n"
+.ascii "wp		: yes\n"
+.ascii "flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow up\n"
+.ascii "bogomips	: 3330.53\n"
+.ascii "clflush size	: 32\n"
+.ascii "power management: ts\n\0"
+
+uname_info:
+.ascii "Linux\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "tobler\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "2.6.29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "#1 SMP Mon May 4 09:51:54 EDT 2009\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+
+sysinfo_buff:
+.long 0,0,0,0,512*1024*1024,0,0,0,0
+.long 0,0,0,0,0,0,0,0,0
+
+#============================================================================
+#	section .bss
+#============================================================================
+.bss
+
+.lcomm  text_buf, (N+F-1)
+.lcomm	out_buffer,16384
+
+
+
+
+
diff --git a/exp-bbv/tests/x86-linux/ll.post.exp b/exp-bbv/tests/x86-linux/ll.post.exp
new file mode 100644
index 0000000..6c1fd4d
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/ll.post.exp
@@ -0,0 +1,48 @@
+T:1:9   :7:10   :5:38   :2:44   :8:65   :9:663   :4:119   :6:2   :3:51   
+T:7:5   :5:16   :2:18   :8:52   :9:858   :4:35   :6:1   :3:15   
+T:7:5   :5:16   :2:18   :8:52   :9:858   :4:35   :6:1   :3:15   
+T:7:5   :5:14   :2:16   :8:91   :9:863   :4:7   :6:1   :3:3   
+T:7:5   :5:12   :2:14   :8:78   :9:880   :4:7   :6:1   :3:3   
+T:7:5   :5:6   :2:8   :8:52   :9:928   :6:1   
+T:7:5   :5:10   :2:11   :8:65   :9:908   :6:1   
+T:7:5   :5:14   :2:17   :8:117   :9:846   :6:1   
+T:5:8   :2:8   :8:52   :9:932   
+T:7:5   :5:8   :2:10   :8:65   :9:911   :6:1   
+T:5:8   :2:8   :8:52   :9:932   
+T:7:5   :5:6   :2:8   :8:52   :9:928   :6:1   
+T:5:6   :2:6   :8:39   :9:949   
+T:7:5   :5:6   :2:8   :8:52   :9:928   :6:1   
+T:5:4   :2:4   :8:26   :9:966   
+T:7:5   :5:12   :2:14   :8:78   :9:880   :4:7   :6:1   :3:3   
+T:5:6   :2:6   :8:39   :9:949   
+T:7:5   :5:8   :2:10   :8:65   :9:911   :6:1   
+T:7:5   :5:14   :2:16   :8:91   :9:863   :4:7   :6:1   :3:3   
+T:5:8   :2:8   :8:52   :9:932   
+T:7:5   :5:10   :2:12   :8:78   :9:894   :6:1   
+T:7:5   :5:10   :2:12   :8:75   :9:897   :6:1   
+T:5:12   :2:12   :8:81   :9:895   
+T:7:5   :5:8   :2:8   :8:39   :9:389   :4:7   :6:1   :3:3   :10:3   :11:9   :12:528   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:12:1000   
+T:15:4   :18:2   :19:3   :20:2   :21:3   :22:4   :16:283   :17:10   :12:686   :13:1   :14:2   
+T:23:1   :32:7   :34:352   :33:177   :16:1   :17:2   :24:10   :25:195   :26:4   :27:3   :30:4   :31:11   :11:9   :12:204   :13:2   :14:4   :28:9   :29:5   
+T:34:667   :33:333   
+T:34:665   :33:332   :35:3   
+T:34:128   :33:64   :36:4   :37:8   :49:6   :38:8   :40:407   :39:274   :41:21   :42:14   :43:6   :44:10   :45:10   :46:8   :47:12   :48:2   :16:16   :17:2   
+T:50:2   :51:4   :52:2   :53:2   :54:6   :56:3   :57:4   :38:4   :40:405   :39:272   :41:18   :42:12   :43:9   :44:30   :45:30   :46:26   :47:39   :48:4   :16:88   :17:6   :28:9   :55:12   :29:13   
+T:40:600   :39:400   
+T:58:2   :59:3   :40:352   :39:236   :41:18   :42:12   :43:6   :44:16   :45:16   :46:14   :47:21   :48:2   :16:68   :17:2   :24:10   :25:210   :26:4   :27:3   :28:5   
+
+
+# Thread 1
+#   Total intervals: 39 (Interval Size 1000)
+#   Total instructions: 39439
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
diff --git a/exp-bbv/tests/x86-linux/ll.stderr.exp b/exp-bbv/tests/x86-linux/ll.stderr.exp
new file mode 100644
index 0000000..a78db79
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/ll.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 39 (Interval Size 1000)
+#   Total instructions: 39439
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/x86-linux/ll.stdout.exp b/exp-bbv/tests/x86-linux/ll.stdout.exp
new file mode 100644
index 0000000..42415bc
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/ll.stdout.exp
@@ -0,0 +1,17 @@
+###############################################################################
+###############################################################################
+##################################################################O#O##########
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+###############################################################################
+
+Linux Version 2.6.29, Compiled #1 SMP Mon May 4 09:51:54 EDT 2009
+One 1665MHz AMD Athlon(tm) Processor, 512M RAM, 3330.53 Bogomips Total
+tobler
+
diff --git a/exp-bbv/tests/x86-linux/ll.vgtest b/exp-bbv/tests/x86-linux/ll.vgtest
new file mode 100644
index 0000000..6031a58
--- /dev/null
+++ b/exp-bbv/tests/x86-linux/ll.vgtest
@@ -0,0 +1,5 @@
+prog: ll
+vgopts: --interval-size=1000 --bb-out-file=ll.out.bb
+post:	cat ll.out.bb
+cleanup: rm ll.out.bb
+
diff --git a/exp-bbv/tests/x86/Makefile.am b/exp-bbv/tests/x86/Makefile.am
new file mode 100644
index 0000000..3857aae
--- /dev/null
+++ b/exp-bbv/tests/x86/Makefile.am
@@ -0,0 +1,28 @@
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+check_PROGRAMS = \
+	million rep_prefix fldcw_check complex_rep
+
+EXTRA_DIST = \
+	   complex_rep.stderr.exp \
+	   complex_rep.vgtest \
+	   fldcw_check.stderr.exp \
+	   fldcw_check.vgtest \
+	   million.stderr.exp \
+	   million.post.exp \
+	   million.vgtest \
+	   rep_prefix.stderr.exp \
+	   rep_prefix.vgtest 
+
+AM_CCASFLAGS += -ffreestanding
+# The tests define their own _start and exit via int $0x80, so link without startup files or default libraries.
+LDFLAGS += @FLAG_M32@ -static -nostartfiles -nodefaultlibs
+
+complex_rep_SOURCES = complex_rep.S
+fldcw_check_SOURCES = fldcw_check.S
+million_SOURCES = million.S
+rep_prefix_SOURCES = rep_prefix.S
+
+AM_CCASFLAGS += @FLAG_M32@
diff --git a/exp-bbv/tests/x86/complex_rep.S b/exp-bbv/tests/x86/complex_rep.S
new file mode 100644
index 0000000..fca36e5
--- /dev/null
+++ b/exp-bbv/tests/x86/complex_rep.S
@@ -0,0 +1,62 @@
+# When trying (and failing) to instrument at the basic block level
+# I thought up a lot of corner-cases in the rep code.  This tries
+# to catch some of them
+
+# Performance counters give us 8207 insns
+#    11 + 8*1024 + 3 = 8206
+
+	.globl _start	
+_start:	
+	cld				# we want these to happen forward
+
+	mov    $0xfeb1378,%eax		# value to store (stosb writes only al)
+
+	# test back-to-back rep/stosb's
+
+	mov	$1024,%ecx		# repeat count for the first rep
+	mov	$buffer1, %edi		# set destination
+	rep	stosb	    		# store 1024 times
+	rep	stosb	    		# ecx is already 0: should store 0 times
+	rep	stosb			# should store 0 times
+
+	
+	# test stosb where cx is 0
+	
+	xor    %ecx,%ecx
+	mov    $buffer1, %edi		# set destination
+	rep    stosb	  		# ecx is 0, so nothing should be stored
+	
+	# test rep inside of a loop
+	
+	mov    $1024, %ebx		# outer loop counter
+rep_loop:	
+
+	mov    $1024,%ecx
+	mov    $buffer1, %edi		# set destination
+	rep    stosb			# first 1024-byte fill of this iteration
+	
+	mov    $1024,%ecx
+	mov    $buffer1, %edi		# set destination
+	rep    stosb			# second fill over the same bytes
+
+	dec    %ebx
+	jnz    rep_loop			# 8 instructions per iteration, 1024 iterations
+	
+	
+	#================================
+	# Exit
+	#================================
+exit:
+     	mov	$1,%eax			# exit syscall number
+#ifdef VGO_darwin
+	pushl	$0			# exit status 0 goes on the stack on Darwin
+#else	
+	xor     %ebx,%ebx		# we return 0
+#endif	
+	int	$0x80          		# and exit
+
+
+#.bss
+
+.lcomm	buffer1,	16384
+
diff --git a/exp-bbv/tests/x86/complex_rep.stderr.exp b/exp-bbv/tests/x86/complex_rep.stderr.exp
new file mode 100644
index 0000000..ceabe14
--- /dev/null
+++ b/exp-bbv/tests/x86/complex_rep.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 0 (Interval Size 100000)
+#   Total instructions: 8206
+#   Total reps: 2100228
+#   Unique reps: 2052
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/x86/complex_rep.vgtest b/exp-bbv/tests/x86/complex_rep.vgtest
new file mode 100644
index 0000000..ef5ac30
--- /dev/null
+++ b/exp-bbv/tests/x86/complex_rep.vgtest
@@ -0,0 +1,4 @@
+prog: complex_rep
+vgopts: --interval-size=100000 --bb-out-file=complex_rep.out.bb
+cleanup: rm complex_rep.out.bb
+
diff --git a/exp-bbv/tests/x86/filter_stderr b/exp-bbv/tests/x86/filter_stderr
new file mode 100644
index 0000000..1c07666
--- /dev/null
+++ b/exp-bbv/tests/x86/filter_stderr
@@ -0,0 +1,5 @@
+#! /bin/sh
+# Thin wrapper: run the filter_stderr script that lives in the parent tests directory.
+../filter_stderr
+
+
diff --git a/exp-bbv/tests/x86/fldcw_check.S b/exp-bbv/tests/x86/fldcw_check.S
new file mode 100644
index 0000000..ef4c3a7
--- /dev/null
+++ b/exp-bbv/tests/x86/fldcw_check.S
@@ -0,0 +1,130 @@
+
+.globl _start
+
+_start:
+        # This code tests for the fldcw "load floating point control word"
+	#   instruction.  On most x86 processors the retired_instruction
+	#   performance counter counts this as one instruction.  However,
+	#   on Pentium 4 systems it counts as two.  Therefore this can
+	#   affect BBV results on such a system.
+	# fldcw is most often used to set the rounding mode when doing
+	#   floating point to integer conversions
+	
+	# It is encoded as "d9 /5" which means
+	#   1101 1001 xx10 1yyy
+	# Where xx is the "mod" which will be 00, 01, or 10 indicating offset
+	#   and yyy is the register field
+
+
+
+        # these are instructions with similar encodings to fldcw
+	# that can cause false positives if the test isn't explicit enough
+similar:	
+        fld1   	   	       		# d9 e8
+	fldl2t				# d9 e9
+	fldl2e				# d9 ea
+	fldpi				# d9 eb
+	fldlg2				# d9 ec
+	fldln2				# d9 ed
+	fldz				# d9 ee
+
+	# check some varied ways of calling fldcw
+
+
+	# offset on stack
+stack:	
+	sub	$4,%esp			# allocate space on stack
+	fnstcw	2(%esp)			# store the current control word at esp+2
+	fldcw	2(%esp)			# reload it through an esp-relative operand
+	add	$4,%esp			# restore stack
+	
+	# 32-bit register
+	
+	fnstcw	cw			# keep a copy of the control word in cw for the loads below
+	mov	$cw,%eax
+	fldcw	0(%eax)			# eax
+	mov	$cw,%ebx
+	fldcw	0(%ebx)			# ebx
+	mov	$cw,%ecx	
+	fldcw	0(%ecx)			# ecx
+	mov	$cw,%edx		 
+	fldcw	0(%edx)			# edx
+	
+	# register + 8-bit offset
+eight_bit:	
+	mov	$cw,%eax
+	sub	$32,%eax		# bias the base so that 32(%reg) addresses cw
+	
+	fldcw	32(%eax)		# eax + 8 bit offset
+	mov	%eax,%ebx
+	fldcw	32(%ebx)		# ebx + 8 bit offset	
+	mov	%eax,%ecx
+	fldcw	32(%ecx)		# ecx + 8 bit offset		
+	mov	%eax,%edx
+	fldcw	32(%edx)		# edx + 8 bit offset
+	
+	# register + 32-bit offset
+thirtytwo_bit:	
+	mov	$cw,%eax
+	sub	$30000,%eax		# bias the base so that 30000(%reg) addresses cw
+	
+	fldcw	30000(%eax)		# eax + 32 bit offset (30000 does not fit in 8 bits)
+	mov	%eax,%ebx
+	fldcw	30000(%ebx)		# ebx + 32 bit offset
+	mov	%eax,%ecx
+	fldcw	30000(%ecx)		# ecx + 32 bit offset
+	mov	%eax,%edx
+	fldcw	30000(%edx)		# edx + 32 bit offset
+
+	# check an fp/integer conversion
+	# in a loop to give a bigger count
+
+	mov	$1024,%ecx		# loop count; each pass executes two fldcw instructions
+big_loop:
+
+	fldl	three			# load value onto fp stack
+	fnstcw	saved_cw		# store control word to mem
+	movzwl	saved_cw, %eax		# load cw from mem, zero extending
+	movb	$12, %ah		# set cw for "round to zero"
+	movw	%ax, cw			# store back to memory
+	fldcw	cw   			# load the new rounding mode
+	fistpl	result			# save stack value as integer to mem
+	fldcw	saved_cw		# restore old cw
+	
+	loop	big_loop		# loop to make the count more obvious
+
+	movl	result, %ebx		# sanity check to see if the
+	cmp	$3,%ebx			# result is the expected one
+	je	exit			# correct: skip the error message
+	
+print_error:
+	mov 	$4,%eax			# write syscall
+#ifdef VGO_darwin
+	pushl	$1			# NOTE(review): args are pushed fd, buf, len with no extra slot - confirm against the Darwin int $0x80 convention
+	pushl	$error
+	pushl	$22
+#else	
+	mov	$1,%ebx			# stdout
+	mov	$error,%ecx		# string	
+	mov 	$22,%edx		# length of string
+#endif	
+	int 	$0x80			# falls through into exit afterwards
+	
+exit:
+#ifdef VGO_darwin
+	pushl	result			# converted value becomes the exit status
+#else	
+	movl	result, %ebx		# load converted value
+#endif	
+	movl	$1,	%eax		# SYSCALL_EXIT
+	int	$0x80			# exit with the converted value as status
+	
+
+
+.data
+saved_cw:	.long 0			# control word captured at the top of each big_loop pass
+cw:  	.long	0			# scratch control word that the fldcw variants load from
+result: .long	0			# integer written by fistpl; expected to be 3
+three:	.long	0			# a floating point 3.0
+	.long	1074266112		# high word 0x40080000 of the double 3.0
+error:	.asciz  "Error!  Wrong result!\n"
diff --git a/exp-bbv/tests/x86/fldcw_check.stderr.exp b/exp-bbv/tests/x86/fldcw_check.stderr.exp
new file mode 100644
index 0000000..c1add90
--- /dev/null
+++ b/exp-bbv/tests/x86/fldcw_check.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 0 (Interval Size 10000)
+#   Total instructions: 9261
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 2061
diff --git a/exp-bbv/tests/x86/fldcw_check.vgtest b/exp-bbv/tests/x86/fldcw_check.vgtest
new file mode 100644
index 0000000..f9bbae9
--- /dev/null
+++ b/exp-bbv/tests/x86/fldcw_check.vgtest
@@ -0,0 +1,4 @@
+prog: fldcw_check
+vgopts: --interval-size=10000 --bb-out-file=fldcw_check.out.bb
+cleanup: rm fldcw_check.out.bb
+
diff --git a/exp-bbv/tests/x86/million.S b/exp-bbv/tests/x86/million.S
new file mode 100644
index 0000000..0d72b00
--- /dev/null
+++ b/exp-bbv/tests/x86/million.S
@@ -0,0 +1,33 @@
+		# many thanks to David Fang
+		# for providing an OSX 10.5 machine to test on
+
+     	     # count for 1 million instructions
+	     #   total is 1 + 1 + 499997*2 + 4
+
+	.globl _start	
+_start:
+	xor	%ecx,%ecx		# not needed, pads total to 1M
+	mov	$499997,%ecx		# load counter
+test_loop:	
+	dec	%ecx			# repeat count times
+	jnz	test_loop		# two instructions per iteration
+
+	#================================
+	# Exit
+	#================================
+
+	# syscall numbers in /usr/include/sys/syscall.h on OSX
+	#                 in arch/x86/include/asm/unistd_32.h on Linux
+	# disassemble on OSX with otool -tV
+exit:
+#ifdef VGO_darwin
+	pushl   $0			# we return 0
+	xor	%eax,%eax
+	inc	%eax	 		# put exit syscall number (1) in eax
+	int     $0x80             	# and exit
+#else	
+	xor     %ebx,%ebx		# we return 0
+	xor	%eax,%eax
+	inc	%eax	 		# put exit syscall number (1) in eax
+	int     $0x80             	# and exit
+#endif
diff --git a/exp-bbv/tests/x86/million.post.exp b/exp-bbv/tests/x86/million.post.exp
new file mode 100644
index 0000000..6eb56fc
--- /dev/null
+++ b/exp-bbv/tests/x86/million.post.exp
@@ -0,0 +1,18 @@
+T:1:4   :2:99997   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+T:2:100000   
+
+
+# Thread 1
+#   Total intervals: 10 (Interval Size 100000)
+#   Total instructions: 1000000
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
+
diff --git a/exp-bbv/tests/x86/million.stderr.exp b/exp-bbv/tests/x86/million.stderr.exp
new file mode 100644
index 0000000..adeb35d
--- /dev/null
+++ b/exp-bbv/tests/x86/million.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 10 (Interval Size 100000)
+#   Total instructions: 1000000
+#   Total reps: 0
+#   Unique reps: 0
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/x86/million.vgtest b/exp-bbv/tests/x86/million.vgtest
new file mode 100644
index 0000000..fc91c77
--- /dev/null
+++ b/exp-bbv/tests/x86/million.vgtest
@@ -0,0 +1,5 @@
+prog: million 
+vgopts: --interval-size=100000 --bb-out-file=million.out.bb --pc-out-file=million.out.pc
+post:	cat million.out.bb
+cleanup: rm million.out.bb million.out.pc
+
diff --git a/exp-bbv/tests/x86/rep_prefix.S b/exp-bbv/tests/x86/rep_prefix.S
new file mode 100644
index 0000000..346248c
--- /dev/null
+++ b/exp-bbv/tests/x86/rep_prefix.S
@@ -0,0 +1,280 @@
+#
+# rep, repe (repz) and repne (repnz) prefixed string instructions
+#   only count as one instruction, even though they repeat many times
+# This test makes sure the bbv plugin counts these instructions properly
+# The expected counts were validated against hardware performance counters.
+#
+
+	.globl	_start
+_start:
+	cld				# clear DF so string ops walk upward
+
+	#===================================
+	# The operand-size prefix (0x66) and the rep prefix (0xf3)
+	#   can precede the opcode in either order.  Older and newer
+	#   binutils disagree, so both are hand-encoded here.
+	#===================================
+
+size_prefix:
+	# word-sized rep load, 0x66 ahead of 0xf3
+
+	movl	$8192, %ecx		# repeat count
+	movl	$buffer1, %esi		# source pointer
+	.byte	0x66, 0xf3, 0xad	# rep lodsw
+	# same load again, 0xf3 ahead of 0x66
+	movl	$8192, %ecx		# repeat count
+	movl	$buffer1, %esi		# source pointer
+	.byte	0xf3, 0x66, 0xad	# rep lodsw
+	
+	
+	
+
+	#===================================
+	# rep stos / rep lods round trips through buffer1
+	#===================================
+loadstore:
+	xorl	%eax, %eax
+	movb	$0xd, %al		# eax = 0x0000000d, the byte pattern
+
+	# byte-wide store
+
+	movl	$16384, %ecx		# repeat count
+	movl	$buffer1, %edi		# destination pointer
+	rep	stosb			# write al to 16384 consecutive bytes
+
+	# byte-wide load
+
+	movl	$16384, %ecx		# repeat count
+	movl	$buffer1, %esi		# source pointer
+	rep	lodsb			# al keeps the last byte read
+
+	cmpb	$0xd, %al		# value read back must match the pattern
+	jne	print_error		# mismatch: report it
+
+	# word-wide store
+
+	movw	$0x020d, %ax		# 16-bit pattern
+
+	movl	$8192, %ecx		# repeat count
+	movl	$buffer1, %edi		# destination pointer
+	rep	stosw			# write ax to 8192 consecutive words
+
+	# word-wide load
+
+	movl	$8192, %ecx		# repeat count
+	movl	$buffer1, %esi		# source pointer
+	rep	lodsw			# ax keeps the last word read
+
+	cmpw	$0x020d, %ax		# value read back must match the pattern
+	jne	print_error		# mismatch: report it
+
+	# dword-wide store
+
+	movl	$0x0feb1378, %eax	# 32-bit pattern
+
+	movl	$4096, %ecx		# repeat count
+	movl	$buffer1, %edi		# destination pointer
+	rep	stosl			# write eax to 4096 consecutive dwords
+
+	# dword-wide load
+
+	movl	$4096, %ecx		# repeat count
+	movl	$buffer1, %esi		# source pointer
+	rep	lodsl			# eax keeps the last dword read
+
+	cmpl	$0x0feb1378, %eax	# value read back must match the pattern
+	jne	print_error		# mismatch: report it
+
+	#=============================
+	# rep movs at each operand width
+	#=============================
+moves:
+	# byte-wide copy, buffer1 to buffer2
+
+	movl	$16384, %ecx		# repeat count
+	movl	$buffer1, %esi		# source pointer
+	movl	$buffer2, %edi		# destination pointer
+	rep	movsb
+
+	# word-wide copy, buffer2 back to buffer1
+
+	movl	$8192, %ecx		# repeat count
+	movl	$buffer2, %esi		# source pointer
+	movl	$buffer1, %edi		# destination pointer
+	rep	movsw
+
+	# dword-wide copy, buffer1 to buffer2
+
+	movl	$4096, %ecx		# repeat count
+	movl	$buffer1, %esi		# source pointer
+	movl	$buffer2, %edi		# destination pointer
+	rep	movsl
+	
+	#==================================
+	# repe cmps over two identical buffers
+	#==================================
+compare_equal:
+	# give both buffers the same contents before comparing
+
+	movl	$0xa5a5a5a5, %eax	# fill pattern
+	movl	$buffer1, %edi		# destination pointer
+	movl	$4096, %ecx		# repeat count
+	rep	stosl
+
+	movl	$0xa5a5a5a5, %eax	# same pattern again
+	movl	$buffer2, %edi		# destination pointer
+	movl	$4096, %ecx		# repeat count
+	rep	stosl
+
+	# byte-wide compare
+
+	movl	$buffer1, %esi		# first operand pointer
+	movl	$buffer2, %edi		# second operand pointer
+	movl	$16384, %ecx		# repeat count
+	repe	cmpsb			# repeats while elements are equal
+	jnz	print_error		# ZF clear means a mismatch was seen
+
+	# word-wide compare
+
+	movl	$buffer1, %esi		# first operand pointer
+	movl	$buffer2, %edi		# second operand pointer
+	movl	$8192, %ecx		# repeat count
+	repe	cmpsw			# repeats while elements are equal
+	jnz	print_error		# ZF clear means a mismatch was seen
+
+	# dword-wide compare
+
+	movl	$buffer1, %esi		# first operand pointer
+	movl	$buffer2, %edi		# second operand pointer
+	movl	$4096, %ecx		# repeat count
+	repe	cmpsl			# repeats while elements are equal
+	jnz	print_error		# ZF clear means a mismatch was seen
+	
+	#==================================
+	# repne cmps over two buffers that differ everywhere
+	#==================================
+compare_noteq:
+	# overwrite buffer2 so it no longer matches buffer1
+
+	movl	$0x5a5a5a5a, %eax	# new fill pattern
+	movl	$buffer2, %edi		# destination pointer
+	movl	$4096, %ecx		# repeat count
+	rep	stosl
+
+	# byte-wide compare
+
+	movl	$buffer1, %esi		# first operand pointer
+	movl	$buffer2, %edi		# second operand pointer
+	movl	$16384, %ecx		# repeat count
+	repne	cmpsb			# repeats while elements differ
+	je	print_error		# ZF set means an unexpected match
+
+	# word-wide compare
+
+	movl	$buffer1, %esi		# first operand pointer
+	movl	$buffer2, %edi		# second operand pointer
+	movl	$8192, %ecx		# repeat count
+	repne	cmpsw			# repeats while elements differ
+	je	print_error		# ZF set means an unexpected match
+
+	# dword-wide compare
+
+	movl	$buffer1, %esi		# first operand pointer
+	movl	$buffer2, %edi		# second operand pointer
+	movl	$4096, %ecx		# repeat count
+	repne	cmpsl			# repeats while elements differ
+	je	print_error		# ZF set means an unexpected match
+	
+	#====================================
+	# repe scas: buffer1 holds only the value being scanned for
+	#====================================
+
+	# byte-wide scan
+
+	movb	$0xa5, %al		# value to look for
+	movl	$buffer1, %edi		# scan pointer
+	movl	$16384, %ecx		# repeat count
+	repe	scasb			# repeats while memory equals al
+	jnz	print_error		# ZF clear means a mismatch was seen
+
+	# word-wide scan
+
+	movw	$0xa5a5, %ax		# value to look for
+	movl	$buffer1, %edi		# scan pointer
+	movl	$8192, %ecx		# repeat count
+	repe	scasw			# repeats while memory equals ax
+	jnz	print_error		# ZF clear means a mismatch was seen
+
+	# dword-wide scan
+
+	movl	$0xa5a5a5a5, %eax	# value to look for
+	movl	$buffer1, %edi		# scan pointer
+	movl	$4096, %ecx		# repeat count
+	repe	scasl			# repeats while memory equals eax
+	jnz	print_error		# ZF clear means a mismatch was seen
+
+	#====================================
+	# repne scas: buffer2 never holds the value being scanned for
+	#====================================
+
+	# byte-wide scan
+
+	movb	$0xa5, %al		# value that must not be found
+	movl	$buffer2, %edi		# scan pointer
+	movl	$16384, %ecx		# repeat count
+	repne	scasb			# repeats while memory differs from al
+	jz	print_error		# ZF set means an unexpected match
+
+	# word-wide scan
+
+	movw	$0xa5a5, %ax		# value that must not be found
+	movl	$buffer2, %edi		# scan pointer
+	movl	$8192, %ecx		# repeat count
+	repne	scasw			# repeats while memory differs from ax
+	jz	print_error		# ZF set means an unexpected match
+
+	# dword-wide scan
+
+	movl	$0xa5a5a5a5, %eax	# value that must not be found
+	movl	$buffer2, %edi		# scan pointer
+	movl	$4096, %ecx		# repeat count
+	repne	scasl			# repeats while memory differs from eax
+	jz	print_error		# ZF set means an unexpected match
+
+	jmp	exit			# every check passed; skip the error report
+	
+print_error:				# write error_string to stdout, then fall into exit
+	mov 	$4, %eax		# Write syscall
+#ifdef VGO_darwin
+	pushl	$16			# arg 3: number of bytes to write
+	pushl	$error_string		# arg 2: buffer to write
+	pushl	$1			# arg 1: fd 1 (stdout)
+	pushl	$0			# filler word: BSD-style int $0x80 reads args above it
+#else	
+	mov	$1, %ebx		# print to stdout
+	mov	$error_string, %ecx	# string to print
+	mov	$16, %edx      	   	# strlen
+#endif	
+	int	$0x80	 		# call syscall
+
+	#================================
+	# Exit with status 0 (jumped to on success, fallen into after print_error)
+	#================================
+exit:
+#ifdef VGO_darwin
+	pushl	$0			# Darwin: status 0 on the stack; TODO confirm no filler word is needed
+#else
+	xor     %ebx,%ebx		# Linux: status 0 in ebx (ebx is 1 after print_error)
+#endif
+	xor	%eax,%eax
+	inc	%eax	 		# put exit syscall number (1) in eax
+	int     $0x80             	# and exit
+
+
+.data
+error_string:	.asciz "Error detected!\n"	# message written by print_error (16 bytes requested)
+
+#.bss
+
+.lcomm	buffer1,	16384		# 16384-byte work buffer for the string-instruction tests
+.lcomm	buffer2,	16384		# second 16384-byte work buffer (copy/compare/scan partner)
diff --git a/exp-bbv/tests/x86/rep_prefix.stderr.exp b/exp-bbv/tests/x86/rep_prefix.stderr.exp
new file mode 100644
index 0000000..e71e657
--- /dev/null
+++ b/exp-bbv/tests/x86/rep_prefix.stderr.exp
@@ -0,0 +1,6 @@
+# Thread 1
+#   Total intervals: 0 (Interval Size 100000)
+#   Total instructions: 124
+#   Total reps: 229402
+#   Unique reps: 26
+#   Total fldcw instructions: 0
diff --git a/exp-bbv/tests/x86/rep_prefix.vgtest b/exp-bbv/tests/x86/rep_prefix.vgtest
new file mode 100644
index 0000000..bc89a1c
--- /dev/null
+++ b/exp-bbv/tests/x86/rep_prefix.vgtest
@@ -0,0 +1,4 @@
+prog: rep_prefix
+vgopts: --interval-size=100000 --bb-out-file=rep_prefix.out.bb
+cleanup: rm rep_prefix.out.bb
+