Cachegrind/Callgrind: allow for cache sizes other than only powers of two

The number of sets, ie. number of cache lines divided by associativity,
and the cache line size still have to be powers of two.
This change is needed for default cache parameters used on some Intel
Core 2 and Atom processors.

Includes cachegrind manual update and explicit tests with 24KB D1/3MB L2
Reverts addition of 6MB warning to {cachegrind,callgrind}/tests/filter_stderr

Backporting to VALGRIND_3_4_BRANCH needs r8912

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@9080 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/cachegrind/cg-x86.c b/cachegrind/cg-x86.c
index 873c351..eceec6d 100644
--- a/cachegrind/cg-x86.c
+++ b/cachegrind/cg-x86.c
@@ -113,12 +113,7 @@
 
       case 0x0a: *D1c = (cache_t) {  8, 2, 32 }; break;
       case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
-      case 0x0e:
-         /* Real D1 cache configuration is:
-            D1c = (cache_t) { 24, 6, 64 }; */
-         VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb");
-         *D1c = (cache_t) { 16, 4, 64 };
-         break;
+      case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break;
       case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;
 
       /* IA-64 info -- panic! */
@@ -149,12 +144,7 @@
       case 0x43: *L2c = (cache_t) {  512, 4, 32 }; L2_found = True; break;
       case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
       case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
-      case 0x48:
-         /* Real L2 cache configuration is:
-            *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */
-         VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb");
-         *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True;
-         break;
+      case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
       case 0x49:
 	  if ((family == 15) && (model == 6))
 	      /* On Xeon MP (family F, model 6), this is for L3 */
@@ -163,12 +153,7 @@
 	  else
 	      *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
 	  break;
-      case 0x4e:
-         /* Real L2 cache configuration is:
-            *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */
-         VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb");
-         *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
-         break;
+      case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
 
       /* These are sectored, whatever that means */
       case 0x60: *D1c = (cache_t) { 16, 8, 64 };  break;      /* sectored */
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index a30ba75..36ddbab 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -1158,18 +1158,12 @@
 static 
 void check_cache(cache_t* cache, Char *name)
 {
-   /* First check they're all powers of two */
-   if (-1 == VG_(log2)(cache->size)) {
+   /* Simulator requires line size and set count to be powers of two */
+   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
+       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
       VG_(message)(Vg_UserMsg,
-         "error: %s size of %dB not a power of two; aborting.",
-         name, cache->size);
-      VG_(exit)(1);
-   }
-
-   if (-1 == VG_(log2)(cache->assoc)) {
-      VG_(message)(Vg_UserMsg,
-         "error: %s associativity of %d not a power of two; aborting.",
-         name, cache->assoc);
+         "error: %s set count not a power of two; aborting.",
+         name);
       VG_(exit)(1);
    }
 
diff --git a/cachegrind/cg_sim.c b/cachegrind/cg_sim.c
index 6edf126..25bef49 100644
--- a/cachegrind/cg_sim.c
+++ b/cachegrind/cg_sim.c
@@ -44,7 +44,6 @@
    Int          line_size;              /* bytes */
    Int          sets;
    Int          sets_min_1;
-   Int          assoc_bits;
    Int          line_size_bits;
    Int          tag_shift;
    Char         desc_line[128];
@@ -62,7 +61,6 @@
 
    c->sets           = (c->size / c->line_size) / c->assoc;
    c->sets_min_1     = c->sets - 1;
-   c->assoc_bits     = VG_(log2)(c->assoc);
    c->line_size_bits = VG_(log2)(c->line_size);
    c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
 
@@ -111,8 +109,7 @@
    /* First case: word entirely within line. */                             \
    if (set1 == set2) {                                                      \
                                                                             \
-      /* Shifting is a bit faster than multiplying */                       \
-      set = &(L.tags[set1 << L.assoc_bits]);                                \
+      set = &(L.tags[set1 * L.assoc]);                                      \
                                                                             \
       /* This loop is unrolled for just the first case, which is the most */\
       /* common.  We can't unroll any further because it would screw up   */\
@@ -143,7 +140,7 @@
    /* Second case: word straddles two lines. */                             \
    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
    } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
-      set = &(L.tags[set1 << L.assoc_bits]);                                \
+      set = &(L.tags[set1 * L.assoc]);                                      \
       if (tag == set[0]) {                                                  \
          goto block2;                                                       \
       }                                                                     \
@@ -162,7 +159,7 @@
       set[0] = tag;                                                         \
       is_miss = True;                                                       \
 block2:                                                                     \
-      set = &(L.tags[set2 << L.assoc_bits]);                                \
+      set = &(L.tags[set2 * L.assoc]);                                      \
       tag2 = (a+size-1) >> L.tag_shift;                                     \
       if (tag2 == set[0]) {                                                 \
          goto miss_treatment;                                               \
diff --git a/cachegrind/docs/cg-manual.xml b/cachegrind/docs/cg-manual.xml
index f65272b..512eeb4 100644
--- a/cachegrind/docs/cg-manual.xml
+++ b/cachegrind/docs/cg-manual.xml
@@ -142,7 +142,7 @@
   </listitem>
 
   <listitem>
-    <para>Bit-selection hash function: the line(s) in the cache
+    <para>Bit-selection hash function: the set of line(s) in the cache
     to which a memory block maps is chosen by the middle bits
     M--(M+N-1) of the byte address, where:</para>
     <itemizedlist>
@@ -150,15 +150,17 @@
         <para>line size = 2^M bytes</para>
       </listitem>
       <listitem>
-        <para>(cache size / line size) = 2^N bytes</para>
+        <para>(cache size / line size / associativity) = 2^N bytes</para>
       </listitem>
     </itemizedlist> 
   </listitem>
 
   <listitem>
-    <para>Inclusive L2 cache: the L2 cache replicates all the
-    entries of the L1 cache.  This is standard on Pentium chips,
-    but AMD Opterons, Athlons and Durons 
+    <para>Inclusive L2 cache: the L2 cache typically replicates all
+    the entries of the L1 caches, because fetching into L1 involves
+    fetching into L2 first (this does not guarantee strict inclusiveness,
+    as lines evicted from L2 still could reside in L1).  This is
+    standard on Pentium chips, but AMD Opterons, Athlons and Durons
     use an exclusive L2 cache that only holds
     blocks evicted from L1.  Ditto most modern VIA CPUs.</para>
   </listitem>
@@ -176,7 +178,10 @@
 (I1/D1/L2) of the cache from the command line using the
 <computeroutput>--I1</computeroutput>,
 <computeroutput>--D1</computeroutput> and
-<computeroutput>--L2</computeroutput> options.</para>
+<computeroutput>--L2</computeroutput> options.
+For cache parameters to be valid for simulation, the number
+of sets (with associativity being the number of cache lines in
+each set) has to be a power of two.</para>
 
 <para>On PowerPC platforms
 Cachegrind cannot automatically 
@@ -227,10 +232,7 @@
 <para>If you are interested in simulating a cache with different
 properties, it is not particularly hard to write your own cache
 simulator, or to modify the existing ones in
-<computeroutput>vg_cachesim_I1.c</computeroutput>,
-<computeroutput>vg_cachesim_D1.c</computeroutput>,
-<computeroutput>vg_cachesim_L2.c</computeroutput> and
-<computeroutput>vg_cachesim_gen.c</computeroutput>.  We'd be
+<computeroutput>cg_sim.c</computeroutput>. We'd be
 interested to hear from anyone who does.</para>
 
 </sect2>
diff --git a/cachegrind/tests/Makefile.am b/cachegrind/tests/Makefile.am
index 0ffc4da..8e568cc 100644
--- a/cachegrind/tests/Makefile.am
+++ b/cachegrind/tests/Makefile.am
@@ -15,6 +15,7 @@
 	chdir.vgtest chdir.stderr.exp \
 	clreq.vgtest clreq.stderr.exp \
 	dlclose.vgtest dlclose.stderr.exp dlclose.stdout.exp \
+	notpower2.vgtest notpower2.stderr.exp \
 	wrap5.vgtest wrap5.stderr.exp wrap5.stdout.exp
 
 check_PROGRAMS = \
diff --git a/cachegrind/tests/filter_stderr b/cachegrind/tests/filter_stderr
index 43efa2d..a5bc1f4 100755
--- a/cachegrind/tests/filter_stderr
+++ b/cachegrind/tests/filter_stderr
@@ -17,5 +17,4 @@
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d"   |
 sed "/warning: L3 cache detected but ignored/d" |
-sed "/warning: 6Mb L2 cache detected, treating as 4Mb/d" |
 sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d"
diff --git a/cachegrind/tests/notpower2.stderr.exp b/cachegrind/tests/notpower2.stderr.exp
new file mode 100644
index 0000000..8eaf654
--- /dev/null
+++ b/cachegrind/tests/notpower2.stderr.exp
@@ -0,0 +1,17 @@
+
+
+I   refs:
+I1  misses:
+L2i misses:
+I1  miss rate:
+L2i miss rate:
+
+D   refs:
+D1  misses:
+L2d misses:
+D1  miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/cachegrind/tests/notpower2.vgtest b/cachegrind/tests/notpower2.vgtest
new file mode 100644
index 0000000..132cfe5
--- /dev/null
+++ b/cachegrind/tests/notpower2.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+cleanup: rm cachegrind.out.*
diff --git a/callgrind/sim.c b/callgrind/sim.c
index 506ed9e..29f72bd 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -74,7 +74,6 @@
    Bool         sectored;  /* prefetch nearside cacheline on read */
    int          sets;
    int          sets_min_1;
-   int          assoc_bits;
    int          line_size_bits;
    int          tag_shift;
    UWord        tag_mask;
@@ -195,7 +194,6 @@
 
    c->sets           = (c->size / c->line_size) / c->assoc;
    c->sets_min_1     = c->sets - 1;
-   c->assoc_bits     = VG_(log2)(c->assoc);
    c->line_size_bits = VG_(log2)(c->line_size);
    c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
    c->tag_mask       = ~((1<<c->tag_shift)-1);
@@ -259,8 +257,7 @@
     int i, j;
     UWord *set;
 
-    /* Shifting is a bit faster than multiplying */
-    set = &(c->tags[set_no << c->assoc_bits]);
+    set = &(c->tags[set_no * c->assoc]);
 
     /* This loop is unrolled for just the first case, which is the most */
     /* common.  We can't unroll any further because it would screw up   */
@@ -359,8 +356,7 @@
     int i, j;
     UWord *set, tmp_tag;
 
-    /* Shifting is a bit faster than multiplying */
-    set = &(c->tags[set_no << c->assoc_bits]);
+    set = &(c->tags[set_no * c->assoc]);
 
     /* This loop is unrolled for just the first case, which is the most */
     /* common.  We can't unroll any further because it would screw up   */
@@ -407,7 +403,7 @@
     /* Access straddles two lines. */
     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
     else if (((set1 + 1) & (c->sets-1)) == set2) {
-	UWord tag2  = (a+size-1) >> c->tag_shift;
+	UWord tag2  = (a+size-1) & c->tag_mask;
 
 	/* the call updates cache structures as side effect */
 	CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
@@ -676,7 +672,7 @@
     /* We use lower tag bits as offset pointers to cache use info.
      * I.e. some cache parameters don't work.
      */
-    if (c->tag_shift < c->assoc_bits) {
+    if ( (1<<c->tag_shift) < c->assoc) {
 	VG_(message)(Vg_DebugMsg,
 		     "error: Use associativity < %d for cache use statistics!",
 		     (1<<c->tag_shift) );
@@ -690,7 +686,7 @@
 static __inline__
 void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
 {
-    int idx = (high_idx << c->assoc_bits) | low_idx;
+    int idx = (high_idx * c->assoc) + low_idx;
 
     c->use[idx].count ++;
     c->use[idx].mask |= use_mask;
@@ -709,8 +705,7 @@
     UWord *set, tmp_tag;
     UInt use_mask;
 
-    /* Shifting is a bit faster than multiplying */
-    set = &(c->tags[set_no << c->assoc_bits]);
+    set = &(c->tags[set_no * c->assoc]);
     use_mask =
 	c->line_start_mask[a & c->line_size_mask] &
 	c->line_end_mask[(a+size-1) & c->line_size_mask];
@@ -745,7 +740,7 @@
     }
     set[0] = tag | tmp_tag;
 
-    cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
+    cacheuse_L2_miss(c, (set_no * c->assoc) | tmp_tag,
 		     use_mask, a & ~c->line_size_mask);
 
     return Miss;
@@ -756,7 +751,7 @@
 {
     UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
     UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
-    UWord tag  = a >> c->tag_shift;
+    UWord tag  = a & c->tag_mask;
 
     /* Access entirely within line. */
     if (set1 == set2) 
@@ -765,7 +760,7 @@
     /* Access straddles two lines. */
     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
     else if (((set1 + 1) & (c->sets-1)) == set2) {
-	UWord tag2  = a >> c->tag_shift;
+	UWord tag2  = a & c->tag_mask;
 
 	/* the call updates cache structures as side effect */
 	CacheResult res1 =  cacheuse_isMiss(c, set1, tag);
@@ -800,8 +795,7 @@
    /* First case: word entirely within line. */                             \
    if (set1 == set2) {                                                      \
                                                                             \
-      /* Shifting is a bit faster than multiplying */                       \
-      set = &(L.tags[set1 << L.assoc_bits]);                                \
+      set = &(L.tags[set1 * L.assoc]);                                      \
       use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
 	         L.line_end_mask[(a+size-1) & L.line_size_mask];	    \
                                                                             \
@@ -809,7 +803,7 @@
       /* common.  We can't unroll any further because it would screw up   */\
       /* if we have a direct-mapped (1-way) cache.                        */\
       if (tag == (set[0] & L.tag_mask)) {                                   \
-        idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask);              \
+        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
         L.use[idx].count ++;                                                \
         L.use[idx].mask |= use_mask;                                        \
 	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -826,7 +820,7 @@
                set[j] = set[j - 1];                                         \
             }                                                               \
             set[0] = tmp_tag;			                            \
-            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);         \
+            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
             L.use[idx].count ++;                                            \
             L.use[idx].mask |= use_mask;                                    \
 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -842,7 +836,7 @@
          set[j] = set[j - 1];                                               \
       }                                                                     \
       set[0] = tag | tmp_tag;                                               \
-      idx = (set1 << L.assoc_bits) | tmp_tag;                               \
+      idx = (set1 * L.assoc) + tmp_tag;                                     \
       return update_##L##_use(&L, idx,         			            \
 		       use_mask, a &~ L.line_size_mask);		    \
                                                                             \
@@ -850,10 +844,10 @@
    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
    } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
       Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */           \
-      set = &(L.tags[set1 << L.assoc_bits]);                                \
+      set = &(L.tags[set1 * L.assoc]);                                      \
       use_mask = L.line_start_mask[a & L.line_size_mask];		    \
       if (tag == (set[0] & L.tag_mask)) {                                   \
-         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask);             \
+         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
          L.use[idx].count ++;                                               \
          L.use[idx].mask |= use_mask;                                       \
 	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -868,7 +862,7 @@
                set[j] = set[j - 1];                                         \
             }                                                               \
             set[0] = tmp_tag;                                               \
-            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);         \
+            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
             L.use[idx].count ++;                                            \
             L.use[idx].mask |= use_mask;                                    \
 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -882,15 +876,15 @@
          set[j] = set[j - 1];                                               \
       }                                                                     \
       set[0] = tag | tmp_tag;                                               \
-      idx = (set1 << L.assoc_bits) | tmp_tag;                               \
+      idx = (set1 * L.assoc) + tmp_tag;                                     \
       miss1 = update_##L##_use(&L, idx,        			            \
 		       use_mask, a &~ L.line_size_mask);		    \
 block2:                                                                     \
-      set = &(L.tags[set2 << L.assoc_bits]);                                \
+      set = &(L.tags[set2 * L.assoc]);                                      \
       use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];  	    \
       tag2  = (a+size-1) & L.tag_mask;                                      \
       if (tag2 == (set[0] & L.tag_mask)) {                                  \
-         idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask);             \
+         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
          L.use[idx].count ++;                                               \
          L.use[idx].mask |= use_mask;                                       \
 	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -905,7 +899,7 @@
                set[j] = set[j - 1];                                         \
             }                                                               \
             set[0] = tmp_tag;                                               \
-            idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);         \
+            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
             L.use[idx].count ++;                                            \
             L.use[idx].mask |= use_mask;                                    \
 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -919,7 +913,7 @@
          set[j] = set[j - 1];                                               \
       }                                                                     \
       set[0] = tag2 | tmp_tag;                                              \
-      idx = (set2 << L.assoc_bits) | tmp_tag;                               \
+      idx = (set2 * L.assoc) + tmp_tag;                                     \
       miss2 = update_##L##_use(&L, idx,			                    \
 		       use_mask, (a+size-1) &~ L.line_size_mask);	    \
       return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit;     \
@@ -984,7 +978,7 @@
 CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
 {
    UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
-   UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
+   UWord* set = &(L2.tags[setNo * L2.assoc]);
    UWord tag  = memline & L2.tag_mask;
 
    int i, j, idx;
@@ -993,7 +987,7 @@
    CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
 
    if (tag == (set[0] & L2.tag_mask)) {
-     idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
+     idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);
 
      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
@@ -1008,7 +1002,7 @@
 	 set[j] = set[j - 1];
        }
        set[0] = tmp_tag;
-       idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
+       idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
        l1_loaded->dep_use = &(L2.use[idx]);
 
 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
@@ -1024,7 +1018,7 @@
      set[j] = set[j - 1];
    }
    set[0] = tag | tmp_tag;
-   idx = (setNo << L2.assoc_bits) | tmp_tag;
+   idx = (setNo * L2.assoc) + tmp_tag;
    l1_loaded->dep_use = &(L2.use[idx]);
 
    update_L2_use(idx, memline);
@@ -1380,22 +1374,15 @@
 static
 void check_cache(cache_t* cache, Char *name)
 {
-   /* First check they're all powers of two */
-   if (-1 == VG_(log2)(cache->size)) {
+   /* Simulator requires line size and set count to be powers of two */
+   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
+       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
       VG_(message)(Vg_UserMsg,
-         "error: %s size of %dB not a power of two; aborting.",
-         name, cache->size);
-      VG_(exit)(1);
+         "error: %s set count not a power of two; aborting.",
+         name);
    }
 
-   if (-1 == VG_(log2)(cache->assoc)) {
-      VG_(message)(Vg_UserMsg,
-         "error: %s associativity of %d not a power of two; aborting.",
-         name, cache->assoc);
-      VG_(exit)(1);
-   }
-
-  if (-1 == VG_(log2)(cache->line_size)) {
+   if (-1 == VG_(log2)(cache->line_size)) {
       VG_(message)(Vg_UserMsg,
          "error: %s line size of %dB not a power of two; aborting.",
          name, cache->line_size);
diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am
index b95cfc3..1c3f42c 100644
--- a/callgrind/tests/Makefile.am
+++ b/callgrind/tests/Makefile.am
@@ -11,6 +11,10 @@
              simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \
              simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \
              simwork3.vgtest simwork3.stdout.exp simwork3.stderr.exp \
+             notpower2.vgtest notpower2.stderr.exp \
+             notpower2-wb.vgtest notpower2-wb.stderr.exp \
+             notpower2-hwpref.vgtest notpower2-hwpref.stderr.exp \
+             notpower2-use.vgtest notpower2-use.stderr.exp \
              threads.vgtest threads.stderr.exp
 
 check_PROGRAMS = clreq simwork threads
diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr
index 1023bc6..7b69674 100755
--- a/callgrind/tests/filter_stderr
+++ b/callgrind/tests/filter_stderr
@@ -23,5 +23,4 @@
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d"   |
 sed "/warning: L3 cache detected but ignored/d" |
-sed "/warning: 6Mb L2 cache detected, treating as 4Mb/d" |
 sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d"
diff --git a/callgrind/tests/notpower2-hwpref.stderr.exp b/callgrind/tests/notpower2-hwpref.stderr.exp
new file mode 100644
index 0000000..0705c1c
--- /dev/null
+++ b/callgrind/tests/notpower2-hwpref.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I   refs:
+I1  misses:
+L2i misses:
+I1  miss rate:
+L2i miss rate:
+
+D   refs:
+D1  misses:
+L2d misses:
+D1  miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2-hwpref.vgtest b/callgrind/tests/notpower2-hwpref.vgtest
new file mode 100644
index 0000000..9da7dce
--- /dev/null
+++ b/callgrind/tests/notpower2-hwpref.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-use.stderr.exp b/callgrind/tests/notpower2-use.stderr.exp
new file mode 100644
index 0000000..ea9acc8
--- /dev/null
+++ b/callgrind/tests/notpower2-use.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
+Collected :
+
+I   refs:
+I1  misses:
+L2i misses:
+I1  miss rate:
+L2i miss rate:
+
+D   refs:
+D1  misses:
+L2d misses:
+D1  miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2-use.vgtest b/callgrind/tests/notpower2-use.vgtest
new file mode 100644
index 0000000..b8312a7
--- /dev/null
+++ b/callgrind/tests/notpower2-use.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-wb.stderr.exp b/callgrind/tests/notpower2-wb.stderr.exp
new file mode 100644
index 0000000..90da3e4
--- /dev/null
+++ b/callgrind/tests/notpower2-wb.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
+Collected :
+
+I   refs:
+I1  misses:
+L2i misses:
+I1  miss rate:
+L2i miss rate:
+
+D   refs:
+D1  misses:
+L2d misses:
+D1  miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2-wb.vgtest b/callgrind/tests/notpower2-wb.vgtest
new file mode 100644
index 0000000..34a1f6b
--- /dev/null
+++ b/callgrind/tests/notpower2-wb.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2.stderr.exp b/callgrind/tests/notpower2.stderr.exp
new file mode 100644
index 0000000..0705c1c
--- /dev/null
+++ b/callgrind/tests/notpower2.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I   refs:
+I1  misses:
+L2i misses:
+I1  miss rate:
+L2i miss rate:
+
+D   refs:
+D1  misses:
+L2d misses:
+D1  miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2.vgtest b/callgrind/tests/notpower2.vgtest
new file mode 100644
index 0000000..73823d7
--- /dev/null
+++ b/callgrind/tests/notpower2.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+cleanup: rm callgrind.out.*