Change Cachegrind/Callgrind to talk about the LL (last-level) cache instead
of the L2 cache.  This is to accommodate machines with three levels of
cache.  We still simulate only two levels: the first and the last.
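
In outline, the new policy is as follows -- a minimal sketch, simplified
from the x86/AMD64 detection code below (pick_LL is a hypothetical
helper; the real code inlines this logic at the end of
Intel_cache_info() and AMD_cache_info() in cachegrind/cg-x86-amd64.c):

    typedef struct { int size; int assoc; int line_size; } cache_t;

    static void pick_LL(cache_t* LLc,
                        const cache_t* L2c, int L2_found,
                        const cache_t* L3c, int L3_found)
    {
       if (L3_found) {
          /* Three-level machine: simulate I1/D1 plus the L3, and
             behave as if the L2 weren't present. */
          *LLc = *L3c;
       } else if (L2_found) {
          /* Two-level machine: the L2 is the last level, as before. */
          *LLc = *L2c;
       }
       /* Otherwise keep the built-in default LL configuration. */
    }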



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11404 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/NEWS b/NEWS
index ead2949..928ee02 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,20 @@
   --threshold option has changed; this is unlikely to affect many people, if
   you do use it please see the user manual for details.
 
+- Callgrind now can do branch prediction simulation, similar to Cachegrind.
+  In addition, it optionally can count the number of executed global bus events.
+  Both can be used for a better approximation of a "Cycle Estimation" as
+  derived event (you need to update the event formula in KCachegrind yourself).
+
+- Cachegrind and Callgrind now refer to the LL (last-level) cache rather
+  than the L2 cache.  This is to accommodate machines with three levels of
+  cache -- if Cachegrind/Callgrind auto-detects the cache configuration of
+  such a machine, it will run the simulation as if the L2 cache isn't
+  present.  This means the results are less likely to match the true result
+  for the machine, but Cachegrind/Callgrind's results are already only
+  approximate, and should not be considered authoritative.  The results are
+  still useful for giving a general idea about a program's locality.
+
 - Massif has a new option, --pages-as-heap, which is disabled by default.
   When enabled, instead of tracking allocations at the level of heap blocks
   (as allocated with malloc/new/new[]), it instead tracks memory allocations
@@ -24,11 +38,6 @@
   harder than the heap-level output, but this option is useful if you want
   to account for every byte of memory used by a program.
 
-- Callgrind now can do branch prediction simulation, similar to Cachegrind.
-  In addition, it optionally can count the number of executed global bus events.
-  Both can be used for a better approximation of a "Cycle Estimation" as
-  derived event (you need to update the event formula in KCachegrind yourself).
-
 - Added new memcheck command-line option --show-possibly-lost.
 
 
diff --git a/cachegrind/cg-arm.c b/cachegrind/cg-arm.c
index 27f5d0d..e37d0c0 100644
--- a/cachegrind/cg-arm.c
+++ b/cachegrind/cg-arm.c
@@ -37,13 +37,13 @@
 
 #include "cg_arch.h"
 
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                            Bool all_caches_clo_defined)
 {
    // Set caches to default (for Cortex-A8 ?)
    *I1c = (cache_t) {  16384, 4, 64 };
    *D1c = (cache_t) {  16384, 4, 64 };
-   *L2c = (cache_t) { 262144, 8, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
 
    if (!all_caches_clo_defined) {
       VG_(message)(Vg_DebugMsg, 
diff --git a/cachegrind/cg-ppc32.c b/cachegrind/cg-ppc32.c
index ce39c2e..5920c05 100644
--- a/cachegrind/cg-ppc32.c
+++ b/cachegrind/cg-ppc32.c
@@ -37,13 +37,13 @@
 
 #include "cg_arch.h"
 
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                            Bool all_caches_clo_defined)
 {
    // Set caches to default.
    *I1c = (cache_t) {  65536, 2, 64 };
    *D1c = (cache_t) {  65536, 2, 64 };
-   *L2c = (cache_t) { 262144, 8, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
 
    // Warn if config not completely specified from cmd line.  Note that
    // this message is slightly different from the one we give on x86/AMD64
diff --git a/cachegrind/cg-ppc64.c b/cachegrind/cg-ppc64.c
index 82993f8..973664b 100644
--- a/cachegrind/cg-ppc64.c
+++ b/cachegrind/cg-ppc64.c
@@ -37,13 +37,13 @@
 
 #include "cg_arch.h"
 
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                            Bool all_caches_clo_defined)
 {
    // Set caches to default.
    *I1c = (cache_t) {  65536, 2, 64 };
    *D1c = (cache_t) {  65536, 2, 64 };
-   *L2c = (cache_t) { 262144, 8, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
 
    // Warn if config not completely specified from cmd line.  Note that
    // this message is slightly different from the one we give on x86/AMD64
diff --git a/cachegrind/cg-x86-amd64.c b/cachegrind/cg-x86-amd64.c
index 350606b..6794319 100644
--- a/cachegrind/cg-x86-amd64.c
+++ b/cachegrind/cg-x86-amd64.c
@@ -54,9 +54,12 @@
  * array of pre-defined configurations for various parts of the memory
  * hierarchy.
  * According to Intel Processor Identification, App Note 485.
+ *
+ * If an L3 cache is found, then data for it rather than the L2
+ * is returned via *LLc.
  */
 static
-Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
+Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
 {
    Int cpuid1_eax;
    Int cpuid1_ignore;
@@ -65,6 +68,14 @@
    UChar info[16];
    Int   i, trials;
    Bool  L2_found = False;
+   /* If we see L3 cache info, copy it into L3c.  Then, at the end,
+      copy it into *LLc.  Hence if an L3 cache is specified, *LLc will
+      eventually contain a description of it rather than the L2 cache.
+      The use of the L3c intermediary makes this process independent
+      of the order in which the cache specifications appear in
+      info[]. */
+   Bool  L3_found = False;
+   cache_t L3c = { 0, 0, 0 };
 
    if (level < 2) {
       VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level);
@@ -121,18 +132,39 @@
       case 0x90: case 0x96: case 0x9b:
          VG_(tool_panic)("IA-64 cache detected?!");
 
-      case 0x22: case 0x23: case 0x25: case 0x29:
-      case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d:
-      case 0xe2: case 0xe3: case 0xe4: case 0xea: case 0xeb: case 0xec:
-          VG_(dmsg)("warning: L3 cache detected but ignored\n");
-          break;
+      /* L3 cache info. */
+      case 0x22: L3c = (cache_t) { 512,    4, 64 }; L3_found = True; break;
+      case 0x23: L3c = (cache_t) { 1024,   8, 64 }; L3_found = True; break;
+      case 0x25: L3c = (cache_t) { 2048,   8, 64 }; L3_found = True; break;
+      case 0x29: L3c = (cache_t) { 4096,   8, 64 }; L3_found = True; break;
+      case 0x46: L3c = (cache_t) { 4096,   4, 64 }; L3_found = True; break;
+      case 0x47: L3c = (cache_t) { 8192,   8, 64 }; L3_found = True; break;
+      case 0x4a: L3c = (cache_t) { 6144,  12, 64 }; L3_found = True; break;
+      case 0x4b: L3c = (cache_t) { 8192,  16, 64 }; L3_found = True; break;
+      case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break;
+      case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break;
+      case 0xd0: L3c = (cache_t) { 512,    4, 64 }; L3_found = True; break;
+      case 0xd1: L3c = (cache_t) { 1024,   4, 64 }; L3_found = True; break;
+      case 0xd2: L3c = (cache_t) { 2048,   4, 64 }; L3_found = True; break;
+      case 0xd6: L3c = (cache_t) { 1024,   8, 64 }; L3_found = True; break;
+      case 0xd7: L3c = (cache_t) { 2048,   8, 64 }; L3_found = True; break;
+      case 0xd8: L3c = (cache_t) { 4096,   8, 64 }; L3_found = True; break;
+      case 0xdc: L3c = (cache_t) { 1536,  12, 64 }; L3_found = True; break;
+      case 0xdd: L3c = (cache_t) { 3072,  12, 64 }; L3_found = True; break;
+      case 0xde: L3c = (cache_t) { 6144,  12, 64 }; L3_found = True; break;
+      case 0xe2: L3c = (cache_t) { 2048,  16, 64 }; L3_found = True; break;
+      case 0xe3: L3c = (cache_t) { 4096,  16, 64 }; L3_found = True; break;
+      case 0xe4: L3c = (cache_t) { 8192,  16, 64 }; L3_found = True; break;
+      case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break;
+      case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break;
+      case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break;
 
       /* Described as "MLC" in Intel documentation */
-      case 0x21: *L2c = (cache_t) {  256, 8, 64 }; L2_found = True; break;
+      case 0x21: *LLc = (cache_t) {  256, 8, 64 }; L2_found = True; break;
 
       /* These are sectored, whatever that means */
-      case 0x39: *L2c = (cache_t) {  128, 4, 64 }; L2_found = True; break;
-      case 0x3c: *L2c = (cache_t) {  256, 4, 64 }; L2_found = True; break;
+      case 0x39: *LLc = (cache_t) {  128, 4, 64 }; L2_found = True; break;
+      case 0x3c: *LLc = (cache_t) {  256, 4, 64 }; L2_found = True; break;
 
       /* If a P6 core, this means "no L2 cache".  
          If a P4 core, this means "no L3 cache".
@@ -141,20 +173,21 @@
       case 0x40:
           break;
 
-      case 0x41: *L2c = (cache_t) {  128, 4, 32 }; L2_found = True; break;
-      case 0x42: *L2c = (cache_t) {  256, 4, 32 }; L2_found = True; break;
-      case 0x43: *L2c = (cache_t) {  512, 4, 32 }; L2_found = True; break;
-      case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
-      case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
-      case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
+      case 0x41: *LLc = (cache_t) {  128,  4, 32 }; L2_found = True; break;
+      case 0x42: *LLc = (cache_t) {  256,  4, 32 }; L2_found = True; break;
+      case 0x43: *LLc = (cache_t) {  512,  4, 32 }; L2_found = True; break;
+      case 0x44: *LLc = (cache_t) { 1024,  4, 32 }; L2_found = True; break;
+      case 0x45: *LLc = (cache_t) { 2048,  4, 32 }; L2_found = True; break;
+      case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
+      case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
       case 0x49:
-	  if ((family == 15) && (model == 6))
-	      /* On Xeon MP (family F, model 6), this is for L3 */
-	      VG_(dmsg)("warning: L3 cache detected but ignored\n");
-	  else
-	      *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
-	  break;
-      case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
+         if (family == 15 && model == 6) {
+            /* On Xeon MP (family F, model 6), this is for L3 */
+            L3c = (cache_t) { 4096, 16, 64 }; L3_found = True;
+         } else {
+            *LLc = (cache_t) { 4096, 16, 64 }; L2_found = True;
+         }
+         break;
 
       /* These are sectored, whatever that means */
       case 0x60: *D1c = (cache_t) { 16, 8, 64 };  break;      /* sectored */
@@ -181,26 +214,24 @@
          break;  
 
       /* not sectored, whatever that might mean */
-      case 0x78: *L2c = (cache_t) { 1024, 4,  64 }; L2_found = True;  break;
+      case 0x78: *LLc = (cache_t) { 1024, 4,  64 }; L2_found = True;  break;
 
       /* These are sectored, whatever that means */
-      case 0x79: *L2c = (cache_t) {  128, 8,  64 }; L2_found = True;  break;
-      case 0x7a: *L2c = (cache_t) {  256, 8,  64 }; L2_found = True;  break;
-      case 0x7b: *L2c = (cache_t) {  512, 8,  64 }; L2_found = True;  break;
-      case 0x7c: *L2c = (cache_t) { 1024, 8,  64 }; L2_found = True;  break;
-      case 0x7d: *L2c = (cache_t) { 2048, 8,  64 }; L2_found = True;  break;
-      case 0x7e: *L2c = (cache_t) {  256, 8, 128 }; L2_found = True;  break;
-
-      case 0x7f: *L2c = (cache_t) {  512, 2, 64 };  L2_found = True;  break;
-      case 0x80: *L2c = (cache_t) {  512, 8, 64 };  L2_found = True;  break;
-
-      case 0x81: *L2c = (cache_t) {  128, 8, 32 };  L2_found = True;  break;
-      case 0x82: *L2c = (cache_t) {  256, 8, 32 };  L2_found = True;  break;
-      case 0x83: *L2c = (cache_t) {  512, 8, 32 };  L2_found = True;  break;
-      case 0x84: *L2c = (cache_t) { 1024, 8, 32 };  L2_found = True;  break;
-      case 0x85: *L2c = (cache_t) { 2048, 8, 32 };  L2_found = True;  break;
-      case 0x86: *L2c = (cache_t) {  512, 4, 64 };  L2_found = True;  break;
-      case 0x87: *L2c = (cache_t) { 1024, 8, 64 };  L2_found = True;  break;
+      case 0x79: *LLc = (cache_t) {  128, 8,  64 }; L2_found = True;  break;
+      case 0x7a: *LLc = (cache_t) {  256, 8,  64 }; L2_found = True;  break;
+      case 0x7b: *LLc = (cache_t) {  512, 8,  64 }; L2_found = True;  break;
+      case 0x7c: *LLc = (cache_t) { 1024, 8,  64 }; L2_found = True;  break;
+      case 0x7d: *LLc = (cache_t) { 2048, 8,  64 }; L2_found = True;  break;
+      case 0x7e: *LLc = (cache_t) {  256, 8, 128 }; L2_found = True;  break;
+      case 0x7f: *LLc = (cache_t) {  512, 2,  64 }; L2_found = True;  break;
+      case 0x80: *LLc = (cache_t) {  512, 8,  64 }; L2_found = True;  break;
+      case 0x81: *LLc = (cache_t) {  128, 8,  32 }; L2_found = True;  break;
+      case 0x82: *LLc = (cache_t) {  256, 8,  32 }; L2_found = True;  break;
+      case 0x83: *LLc = (cache_t) {  512, 8,  32 }; L2_found = True;  break;
+      case 0x84: *LLc = (cache_t) { 1024, 8,  32 }; L2_found = True;  break;
+      case 0x85: *LLc = (cache_t) { 2048, 8,  32 }; L2_found = True;  break;
+      case 0x86: *LLc = (cache_t) {  512, 4,  64 }; L2_found = True;  break;
+      case 0x87: *LLc = (cache_t) { 1024, 8,  64 }; L2_found = True;  break;
 
       /* Ignore prefetch information */
       case 0xf0: case 0xf1:
@@ -213,8 +244,15 @@
       }
    }
 
+   /* If we found an L3 cache, throw away the L2 data and use the L3's instead. */
+   if (L3_found) {
+      VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
+      *LLc = L3c;
+      L2_found = True;
+   }
+
    if (!L2_found)
-      VG_(dmsg)("warning: L2 cache not installed, ignore L2 results.\n");
+      VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
 
    return 0;
 }
@@ -241,14 +279,37 @@
  * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
  * so we detect that.
  * 
- * Returns 0 on success, non-zero on failure.
+ * Returns 0 on success, non-zero on failure.  As with the Intel code
+ * above, if an L3 cache is found, then data for it rather than the L2
+ * is returned via *LLc.
  */
+
+/* A small helper */
+static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
+{
+   /* Decode an L2/L3 associativity indication.  It is encoded
+      differently from the I1/D1 associativity.  Returns 1
+      (direct-map) as a safe but suboptimal result for unknown
+      encodings. */
+   switch (bits_15_12 & 0xF) {
+      case 1: return 1;    case 2: return 2;
+      case 4: return 4;    case 6: return 8;
+      case 8: return 16;   case 0xA: return 32;
+      case 0xB: return 48; case 0xC: return 64;
+      case 0xD: return 96; case 0xE: return 128;
+      case 0xF: /* fully associative */
+      case 0: /* L2/L3 cache or TLB is disabled */
+      default:
+        return 1;
+   }
+}
+
 static
-Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
 {
    UInt ext_level;
    UInt dummy, model;
-   UInt I1i, D1i, L2i;
+   UInt I1i, D1i, L2i, L3i;
    
    VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);
 
@@ -259,7 +320,7 @@
    }
 
    VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
-   VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy);
+   VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &L3i);
 
    VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);
 
@@ -277,15 +338,26 @@
    I1c->assoc     = (I1i >> 16) & 0xff;
    I1c->line_size = (I1i >>  0) & 0xff;
 
-   L2c->size      = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
-   L2c->assoc     = (L2i >> 12) & 0xf;
-   L2c->line_size = (L2i >>  0) & 0xff;
+   LLc->size      = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
+   LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
+   LLc->line_size = (L2i >>  0) & 0xff;
+
+   if (((L3i >> 18) & 0x3fff) > 0) {
+      /* There's an L3 cache.  Replace *LLc contents with this info. */
+      /* NB: the test in the if is "if L3 size > 0 ".  I don't know if
+         this is the right way to test presence-vs-absence of L3.  I
+         can't see any guidance on this in the AMD documentation. */
+      LLc->size      = ((L3i >> 18) & 0x3fff) * 512;
+      LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
+      LLc->line_size = (L3i >>  0) & 0xff;
+      VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
+   }
 
    return 0;
 }
 
 static 
-Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
 {
    Int  level, ret;
    Char vendor_id[13];
@@ -306,10 +378,10 @@
 
    /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
    if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
-      ret = Intel_cache_info(level, I1c, D1c, L2c);
+      ret = Intel_cache_info(level, I1c, D1c, LLc);
 
    } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
-      ret = AMD_cache_info(I1c, D1c, L2c);
+      ret = AMD_cache_info(I1c, D1c, LLc);
 
    } else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
       /* Total kludge.  Pretend to be a VIA Nehemiah. */
@@ -319,9 +391,9 @@
       I1c->size      = 64;
       I1c->assoc     = 4;
       I1c->line_size = 16;
-      L2c->size      = 64;
-      L2c->assoc     = 16;
-      L2c->line_size = 16;
+      LLc->size      = 64;
+      LLc->assoc     = 16;
+      LLc->line_size = 16;
       ret = 0;
 
    } else {
@@ -332,13 +404,13 @@
    /* Successful!  Convert sizes from KB to bytes */
    I1c->size *= 1024;
    D1c->size *= 1024;
-   L2c->size *= 1024;
+   LLc->size *= 1024;
       
    return ret;
 }
 
 
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                            Bool all_caches_clo_defined)
 {
    Int res;
@@ -346,10 +418,10 @@
    // Set caches to default.
    *I1c = (cache_t) {  65536, 2, 64 };
    *D1c = (cache_t) {  65536, 2, 64 };
-   *L2c = (cache_t) { 262144, 8, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
 
    // Then replace with any info we can get from CPUID.
-   res = get_caches_from_CPUID(I1c, D1c, L2c);
+   res = get_caches_from_CPUID(I1c, D1c, LLc);
 
    // Warn if CPUID failed and config not completely specified from cmd line.
    if (res != 0 && !all_caches_clo_defined) {
diff --git a/cachegrind/cg_arch.h b/cachegrind/cg_arch.h
index 7a8d171..23f1a2c 100644
--- a/cachegrind/cg_arch.h
+++ b/cachegrind/cg_arch.h
@@ -33,14 +33,14 @@
 
 // For cache simulation
 typedef struct {
-   int size;       // bytes
-   int assoc;
-   int line_size;  // bytes
+   Int size;       // bytes
+   Int assoc;
+   Int line_size;  // bytes
 } cache_t;
 
-// Gives the configuration of I1, D1 and L2 caches.  They get overridden
+// Gives the configuration of I1, D1 and LL caches.  They get overridden
 // by any cache configurations specified on the command line.
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                            Bool all_caches_clo_defined);
 
 #endif   // __CG_ARCH_H
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 84341d4..ecdd706 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -77,7 +77,7 @@
    struct {
       ULong a;  /* total # memory accesses of this kind */
       ULong m1; /* misses in the first level cache */
-      ULong m2; /* misses in the second level cache */
+      ULong mL; /* misses in the last-level (LL) cache */
    }
    CacheCC;
 
@@ -268,13 +268,13 @@
       lineCC->loc.line = loc.line;
       lineCC->Ir.a     = 0;
       lineCC->Ir.m1    = 0;
-      lineCC->Ir.m2    = 0;
+      lineCC->Ir.mL    = 0;
       lineCC->Dr.a     = 0;
       lineCC->Dr.m1    = 0;
-      lineCC->Dr.m2    = 0;
+      lineCC->Dr.mL    = 0;
       lineCC->Dw.a     = 0;
       lineCC->Dw.m1    = 0;
-      lineCC->Dw.m2    = 0;
+      lineCC->Dw.mL    = 0;
       lineCC->Bc.b     = 0;
       lineCC->Bc.mp    = 0;
       lineCC->Bi.b     = 0;
@@ -319,7 +319,7 @@
    //VG_(printf)("1I_0D :  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n",
    //             n, n->instr_addr, n->instr_len);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
-                     &n->parent->Ir.m1, &n->parent->Ir.m2);
+                     &n->parent->Ir.m1, &n->parent->Ir.mL);
    n->parent->Ir.a++;
 }
 
@@ -331,10 +331,10 @@
    //            n,  n->instr_addr,  n->instr_len,
    //            n2, n2->instr_addr, n2->instr_len);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
-                     &n->parent->Ir.m1, &n->parent->Ir.m2);
+                     &n->parent->Ir.m1, &n->parent->Ir.mL);
    n->parent->Ir.a++;
    cachesim_I1_doref(n2->instr_addr, n2->instr_len, 
-                     &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+                     &n2->parent->Ir.m1, &n2->parent->Ir.mL);
    n2->parent->Ir.a++;
 }
 
@@ -348,13 +348,13 @@
    //            n2, n2->instr_addr, n2->instr_len,
    //            n3, n3->instr_addr, n3->instr_len);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
-                     &n->parent->Ir.m1, &n->parent->Ir.m2);
+                     &n->parent->Ir.m1, &n->parent->Ir.mL);
    n->parent->Ir.a++;
    cachesim_I1_doref(n2->instr_addr, n2->instr_len, 
-                     &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+                     &n2->parent->Ir.m1, &n2->parent->Ir.mL);
    n2->parent->Ir.a++;
    cachesim_I1_doref(n3->instr_addr, n3->instr_len, 
-                     &n3->parent->Ir.m1, &n3->parent->Ir.m2);
+                     &n3->parent->Ir.m1, &n3->parent->Ir.mL);
    n3->parent->Ir.a++;
 }
 
@@ -365,11 +365,11 @@
    //            "                               daddr=0x%010lx,  dsize=%lu\n",
    //            n, n->instr_addr, n->instr_len, data_addr, data_size);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
-                     &n->parent->Ir.m1, &n->parent->Ir.m2);
+                     &n->parent->Ir.m1, &n->parent->Ir.mL);
    n->parent->Ir.a++;
 
    cachesim_D1_doref(data_addr, data_size, 
-                     &n->parent->Dr.m1, &n->parent->Dr.m2);
+                     &n->parent->Dr.m1, &n->parent->Dr.mL);
    n->parent->Dr.a++;
 }
 
@@ -380,11 +380,11 @@
    //            "                               daddr=0x%010lx,  dsize=%lu\n",
    //            n, n->instr_addr, n->instr_len, data_addr, data_size);
    cachesim_I1_doref(n->instr_addr, n->instr_len, 
-                     &n->parent->Ir.m1, &n->parent->Ir.m2);
+                     &n->parent->Ir.m1, &n->parent->Ir.mL);
    n->parent->Ir.a++;
 
    cachesim_D1_doref(data_addr, data_size, 
-                     &n->parent->Dw.m1, &n->parent->Dw.m2);
+                     &n->parent->Dw.m1, &n->parent->Dw.mL);
    n->parent->Dw.a++;
 }
 
@@ -394,7 +394,7 @@
    //VG_(printf)("0I_1Dr:  CCaddr=0x%010lx,  daddr=0x%010lx,  dsize=%lu\n",
    //            n, data_addr, data_size);
    cachesim_D1_doref(data_addr, data_size, 
-                     &n->parent->Dr.m1, &n->parent->Dr.m2);
+                     &n->parent->Dr.m1, &n->parent->Dr.mL);
    n->parent->Dr.a++;
 }
 
@@ -404,7 +404,7 @@
    //VG_(printf)("0I_1Dw:  CCaddr=0x%010lx,  daddr=0x%010lx,  dsize=%lu\n",
    //            n, data_addr, data_size);
    cachesim_D1_doref(data_addr, data_size, 
-                     &n->parent->Dw.m1, &n->parent->Dw.m2);
+                     &n->parent->Dw.m1, &n->parent->Dw.mL);
    n->parent->Dw.a++;
 }
 
@@ -1234,7 +1234,7 @@
 
 static cache_t clo_I1_cache = UNDEFINED_CACHE;
 static cache_t clo_D1_cache = UNDEFINED_CACHE;
-static cache_t clo_L2_cache = UNDEFINED_CACHE;
+static cache_t clo_LL_cache = UNDEFINED_CACHE;
 
 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
 // string otherwise.
@@ -1273,7 +1273,7 @@
 }
 
 static 
-void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
 {
 #define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)
 
@@ -1283,22 +1283,22 @@
    Bool all_caches_clo_defined =
       (DEFINED(clo_I1_cache) &&
        DEFINED(clo_D1_cache) &&
-       DEFINED(clo_L2_cache));
+       DEFINED(clo_LL_cache));
 
    // Set the cache config (using auto-detection, if supported by the
    // architecture).
-   VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
+   VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
 
    // Check the default/auto-detected values.
    checkRes = check_cache(I1c);  tl_assert(!checkRes);
    checkRes = check_cache(D1c);  tl_assert(!checkRes);
-   checkRes = check_cache(L2c);  tl_assert(!checkRes);
+   checkRes = check_cache(LLc);  tl_assert(!checkRes);
 
    // Then replace with any defined on the command line.  (Already checked in
    // parse_cache_opt().)
    if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
    if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
-   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
+   if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
 
    if (VG_(clo_verbosity) >= 2) {
       VG_(umsg)("Cache configuration used:\n");
@@ -1306,8 +1306,8 @@
                 I1c->size, I1c->assoc, I1c->line_size);
       VG_(umsg)("  D1: %dB, %d-way, %dB lines\n",
                 D1c->size, D1c->assoc, D1c->line_size);
-      VG_(umsg)("  L2: %dB, %d-way, %dB lines\n",
-                L2c->size, L2c->assoc, L2c->line_size);
+      VG_(umsg)("  LL: %dB, %d-way, %dB lines\n",
+                LLc->size, LLc->assoc, LLc->line_size);
    }
 #undef CMD_LINE_DEFINED
 }
@@ -1354,12 +1354,12 @@
       VG_(free)(cachegrind_out_file);
    }
 
-   // "desc:" lines (giving I1/D1/L2 cache configuration).  The spaces after
+   // "desc:" lines (giving I1/D1/LL cache configuration).  The spacing after
    // the 2nd colon makes cg_annotate's output look nicer.
    VG_(sprintf)(buf, "desc: I1 cache:         %s\n"
                      "desc: D1 cache:         %s\n"
-                     "desc: L2 cache:         %s\n",
-                     I1.desc_line, D1.desc_line, L2.desc_line);
+                     "desc: LL cache:         %s\n",
+                     I1.desc_line, D1.desc_line, LL.desc_line);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
 
    // "cmd:" line
@@ -1379,11 +1379,11 @@
    }
    // "events:" line
    if (clo_cache_sim && clo_branch_sim) {
-      VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
+      VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
                                   "Bc Bcm Bi Bim\n");
    }
    else if (clo_cache_sim && !clo_branch_sim) {
-      VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
+      VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
                                   "\n");
    }
    else if (!clo_cache_sim && clo_branch_sim) {
@@ -1430,9 +1430,9 @@
                              " %llu %llu %llu"
                              " %llu %llu %llu %llu\n",
                             lineCC->loc.line,
-                            lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2, 
-                            lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
-                            lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2,
+                            lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL, 
+                            lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
+                            lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL,
                             lineCC->Bc.b, lineCC->Bc.mp, 
                             lineCC->Bi.b, lineCC->Bi.mp);
       }
@@ -1441,9 +1441,9 @@
                              " %llu %llu %llu"
                              " %llu %llu %llu\n",
                             lineCC->loc.line,
-                            lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2, 
-                            lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
-                            lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2);
+                            lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL, 
+                            lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
+                            lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL);
       }
       else if (!clo_cache_sim && clo_branch_sim) {
          VG_(sprintf)(buf, "%u %llu"
@@ -1464,13 +1464,13 @@
       // Update summary stats
       Ir_total.a  += lineCC->Ir.a;
       Ir_total.m1 += lineCC->Ir.m1;
-      Ir_total.m2 += lineCC->Ir.m2;
+      Ir_total.mL += lineCC->Ir.mL;
       Dr_total.a  += lineCC->Dr.a;
       Dr_total.m1 += lineCC->Dr.m1;
-      Dr_total.m2 += lineCC->Dr.m2;
+      Dr_total.mL += lineCC->Dr.mL;
       Dw_total.a  += lineCC->Dw.a;
       Dw_total.m1 += lineCC->Dw.m1;
-      Dw_total.m2 += lineCC->Dw.m2;
+      Dw_total.mL += lineCC->Dw.mL;
       Bc_total.b  += lineCC->Bc.b;
       Bc_total.mp += lineCC->Bc.mp;
       Bi_total.b  += lineCC->Bi.b;
@@ -1487,9 +1487,9 @@
                         " %llu %llu %llu"
                         " %llu %llu %llu"
                         " %llu %llu %llu %llu\n", 
-                        Ir_total.a, Ir_total.m1, Ir_total.m2,
-                        Dr_total.a, Dr_total.m1, Dr_total.m2,
-                        Dw_total.a, Dw_total.m1, Dw_total.m2,
+                        Ir_total.a, Ir_total.m1, Ir_total.mL,
+                        Dr_total.a, Dr_total.m1, Dr_total.mL,
+                        Dw_total.a, Dw_total.m1, Dw_total.mL,
                         Bc_total.b, Bc_total.mp, 
                         Bi_total.b, Bi_total.mp);
    }
@@ -1498,9 +1498,9 @@
                         " %llu %llu %llu"
                         " %llu %llu %llu"
                         " %llu %llu %llu\n",
-                        Ir_total.a, Ir_total.m1, Ir_total.m2,
-                        Dr_total.a, Dr_total.m1, Dr_total.m2,
-                        Dw_total.a, Dw_total.m1, Dw_total.m2);
+                        Ir_total.a, Ir_total.m1, Ir_total.mL,
+                        Dr_total.a, Dr_total.m1, Dr_total.mL,
+                        Dw_total.a, Dw_total.m1, Dw_total.mL);
    }
    else if (!clo_cache_sim && clo_branch_sim) {
       VG_(sprintf)(buf, "summary:"
@@ -1537,8 +1537,8 @@
 
    CacheCC  D_total;
    BranchCC B_total;
-   ULong L2_total_m, L2_total_mr, L2_total_mw,
-         L2_total, L2_total_r, L2_total_w;
+   ULong LL_total_m, LL_total_mr, LL_total_mw,
+         LL_total, LL_total_r, LL_total_w;
    Int l1, l2, l3;
 
    fprint_CC_table_and_calc_totals();
@@ -1565,21 +1565,21 @@
       miss numbers */
    if (clo_cache_sim) {
       VG_(umsg)(fmt, "I1  misses:   ", Ir_total.m1);
-      VG_(umsg)(fmt, "L2i misses:   ", Ir_total.m2);
+      VG_(umsg)(fmt, "LLi misses:   ", Ir_total.mL);
 
       if (0 == Ir_total.a) Ir_total.a = 1;
       VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
       VG_(umsg)("I1  miss rate: %s\n", buf1);
 
-      VG_(percentify)(Ir_total.m2, Ir_total.a, 2, l1+1, buf1);
-      VG_(umsg)("L2i miss rate: %s\n", buf1);
+      VG_(percentify)(Ir_total.mL, Ir_total.a, 2, l1+1, buf1);
+      VG_(umsg)("LLi miss rate: %s\n", buf1);
       VG_(umsg)("\n");
 
       /* D cache results.  Use the D_refs.rd and D_refs.wr values to
        * determine the width of columns 2 & 3. */
       D_total.a  = Dr_total.a  + Dw_total.a;
       D_total.m1 = Dr_total.m1 + Dw_total.m1;
-      D_total.m2 = Dr_total.m2 + Dw_total.m2;
+      D_total.mL = Dr_total.mL + Dw_total.mL;
 
       /* Make format string, getting width right for numbers */
       VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu rd   + %%,%dllu wr)\n",
@@ -1589,8 +1589,8 @@
                      D_total.a, Dr_total.a, Dw_total.a);
       VG_(umsg)(fmt, "D1  misses:   ",
                      D_total.m1, Dr_total.m1, Dw_total.m1);
-      VG_(umsg)(fmt, "L2d misses:   ",
-                     D_total.m2, Dr_total.m2, Dw_total.m2);
+      VG_(umsg)(fmt, "LLd misses:   ",
+                     D_total.mL, Dr_total.mL, Dw_total.mL);
 
       if (0 == D_total.a)  D_total.a = 1;
       if (0 == Dr_total.a) Dr_total.a = 1;
@@ -1600,30 +1600,30 @@
       VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
       VG_(umsg)("D1  miss rate: %s (%s     + %s  )\n", buf1, buf2,buf3);
 
-      VG_(percentify)( D_total.m2,  D_total.a, 1, l1+1, buf1);
-      VG_(percentify)(Dr_total.m2, Dr_total.a, 1, l2+1, buf2);
-      VG_(percentify)(Dw_total.m2, Dw_total.a, 1, l3+1, buf3);
-      VG_(umsg)("L2d miss rate: %s (%s     + %s  )\n", buf1, buf2,buf3);
+      VG_(percentify)( D_total.mL,  D_total.a, 1, l1+1, buf1);
+      VG_(percentify)(Dr_total.mL, Dr_total.a, 1, l2+1, buf2);
+      VG_(percentify)(Dw_total.mL, Dw_total.a, 1, l3+1, buf3);
+      VG_(umsg)("LLd miss rate: %s (%s     + %s  )\n", buf1, buf2,buf3);
       VG_(umsg)("\n");
 
-      /* L2 overall results */
+      /* LL overall results */
 
-      L2_total   = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
-      L2_total_r = Dr_total.m1 + Ir_total.m1;
-      L2_total_w = Dw_total.m1;
-      VG_(umsg)(fmt, "L2 refs:      ",
-                     L2_total, L2_total_r, L2_total_w);
+      LL_total   = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
+      LL_total_r = Dr_total.m1 + Ir_total.m1;
+      LL_total_w = Dw_total.m1;
+      VG_(umsg)(fmt, "LL refs:      ",
+                     LL_total, LL_total_r, LL_total_w);
 
-      L2_total_m  = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
-      L2_total_mr = Dr_total.m2 + Ir_total.m2;
-      L2_total_mw = Dw_total.m2;
-      VG_(umsg)(fmt, "L2 misses:    ",
-                     L2_total_m, L2_total_mr, L2_total_mw);
+      LL_total_m  = Dr_total.mL + Dw_total.mL + Ir_total.mL;
+      LL_total_mr = Dr_total.mL + Ir_total.mL;
+      LL_total_mw = Dw_total.mL;
+      VG_(umsg)(fmt, "LL misses:    ",
+                     LL_total_m, LL_total_mr, LL_total_mw);
 
-      VG_(percentify)(L2_total_m,  (Ir_total.a + D_total.a),  1, l1+1, buf1);
-      VG_(percentify)(L2_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
-      VG_(percentify)(L2_total_mw, Dw_total.a,                1, l3+1, buf3);
-      VG_(umsg)("L2 miss rate:  %s (%s     + %s  )\n", buf1, buf2,buf3);
+      VG_(percentify)(LL_total_m,  (Ir_total.a + D_total.a),  1, l1+1, buf1);
+      VG_(percentify)(LL_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
+      VG_(percentify)(LL_total_mw, Dw_total.a,                1, l3+1, buf3);
+      VG_(umsg)("LL miss rate:  %s (%s     + %s  )\n", buf1, buf2,buf3);
    }
 
    /* If branch profiling is enabled, show branch overall results. */
@@ -1760,8 +1760,9 @@
       parse_cache_opt(&clo_I1_cache, arg, tmp_str);
    else if VG_STR_CLO(arg, "--D1", tmp_str)
       parse_cache_opt(&clo_D1_cache, arg, tmp_str);
-   else if VG_STR_CLO(arg, "--L2", tmp_str)
-      parse_cache_opt(&clo_L2_cache, arg, tmp_str);
+   else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
+            VG_STR_CLO(arg, "--LL", tmp_str))
+      parse_cache_opt(&clo_LL_cache, arg, tmp_str);
 
    else if VG_STR_CLO( arg, "--cachegrind-out-file", clo_cachegrind_out_file) {}
    else if VG_BOOL_CLO(arg, "--cache-sim",  clo_cache_sim)  {}
@@ -1777,7 +1778,7 @@
    VG_(printf)(
 "    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
 "    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
-"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
+"    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
 "    --cache-sim=yes|no  [yes]        collect cache stats?\n"
 "    --branch-sim=yes|no [no]         collect branch prediction stats?\n"
 "    --cachegrind-out-file=<file>     output file name [cachegrind.out.%%p]\n"
@@ -1819,7 +1820,7 @@
 
 static void cg_post_clo_init(void)
 {
-   cache_t I1c, D1c, L2c; 
+   cache_t I1c, D1c, LLc; 
 
    CC_table =
       VG_(OSetGen_Create)(offsetof(LineCC, loc),
@@ -1837,11 +1838,11 @@
                           VG_(malloc), "cg.main.cpci.3",
                           VG_(free));
 
-   configure_caches(&I1c, &D1c, &L2c);
+   configure_caches(&I1c, &D1c, &LLc);
 
    cachesim_I1_initcache(I1c);
    cachesim_D1_initcache(D1c);
-   cachesim_L2_initcache(L2c);
+   cachesim_LL_initcache(LLc);
 }
 
 VG_DETERMINE_INTERFACE_VERSION(cg_pre_clo_init)
diff --git a/cachegrind/cg_sim.c b/cachegrind/cg_sim.c
index a55a1e4..0b8a1d7 100644
--- a/cachegrind/cg_sim.c
+++ b/cachegrind/cg_sim.c
@@ -96,7 +96,7 @@
 /* bigger than its usual limit.  Inlining gains around 5--10% speedup. */   \
 __attribute__((always_inline))                                              \
 static __inline__                                                           \
-void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2)         \
+void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *mL)         \
 {                                                                           \
    UInt  set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);          \
    UInt  set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);          \
@@ -188,9 +188,9 @@
    return;                                                                  \
 }
 
-CACHESIM(L2, (*m2)++ );
-CACHESIM(I1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
-CACHESIM(D1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
+CACHESIM(LL, (*mL)++ );
+CACHESIM(I1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
+CACHESIM(D1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                 cg_sim.c ---*/
diff --git a/cachegrind/docs/cg-manual.xml b/cachegrind/docs/cg-manual.xml
index b312771..b5a820b 100644
--- a/cachegrind/docs/cg-manual.xml
+++ b/cachegrind/docs/cg-manual.xml
@@ -16,33 +16,45 @@
 
 <para>Cachegrind simulates how your program interacts with a machine's cache
 hierarchy and (optionally) branch predictor.  It simulates a machine with
-independent first level instruction and data caches (I1 and D1), backed by a
-unified second level cache (L2).  This configuration is used by almost all
-modern machines.</para>
+independent first-level instruction and data caches (I1 and D1), backed by a
+unified second-level cache (L2).  This exactly matches the configuration of
+many modern machines.</para>
+
+<para>However, some modern machines have three levels of cache.  For these
+machines (in the cases where Cachegrind can auto-detect the cache
+configuration) Cachegrind simulates the first-level and third-level caches.
+The reason for this choice is that the L3 cache has the most influence on
+runtime, as it masks accesses to main memory.  Furthermore, the L1 caches
+often have low associativity, so simulating them can detect cases where the
+code interacts badly with these caches (eg. traversing a matrix column-wise
+with the row length being a power of 2).</para>
+
+<para>Therefore, Cachegrind always refers to the I1, D1 and LL (last-level)
+caches.</para>
 
 <para>
-It gathers the following statistics (abbreviations used for each statistic
+Cachegrind gathers the following statistics (the abbreviation used for each statistic
 is given in parentheses):</para>
 <itemizedlist>
   <listitem>
     <para>I cache reads (<computeroutput>Ir</computeroutput>,
     which equals the number of instructions executed),
     I1 cache read misses (<computeroutput>I1mr</computeroutput>) and
-    L2 cache instruction read misses (<computeroutput>I1mr</computeroutput>).
+    LL cache instruction read misses (<computeroutput>ILmr</computeroutput>).
     </para>
   </listitem>
   <listitem>
     <para>D cache reads (<computeroutput>Dr</computeroutput>, which
     equals the number of memory reads),
     D1 cache read misses (<computeroutput>D1mr</computeroutput>), and
-    L2 cache data read misses (<computeroutput>D2mr</computeroutput>).
+    LL cache data read misses (<computeroutput>DLmr</computeroutput>).
     </para>
   </listitem>
   <listitem>
     <para>D cache writes (<computeroutput>Dw</computeroutput>, which equals
     the number of memory writes),
     D1 cache write misses (<computeroutput>D1mw</computeroutput>), and
-    L2 cache data write misses (<computeroutput>D2mw</computeroutput>).
+    LL cache data write misses (<computeroutput>DLmw</computeroutput>).
     </para>
   </listitem>
   <listitem>
@@ -59,10 +71,10 @@
 
 <para>Note that D1 total accesses is given by
 <computeroutput>D1mr</computeroutput> +
-<computeroutput>D1mw</computeroutput>, and that L2 total
-accesses is given by <computeroutput>I2mr</computeroutput> +
-<computeroutput>D2mr</computeroutput> +
-<computeroutput>D2mw</computeroutput>.
+<computeroutput>D1mw</computeroutput>, and that LL total
+accesses is given by <computeroutput>ILmr</computeroutput> +
+<computeroutput>DLmr</computeroutput> +
+<computeroutput>DLmw</computeroutput>.
 </para>
 
 <para>These statistics are presented for the entire program and for each
@@ -70,7 +82,7 @@
 the program with the counts that were caused directly by it.</para>
 
 <para>On a modern machine, an L1 miss will typically cost
-around 10 cycles, an L2 miss can cost as much as 200
+around 10 cycles, an LL miss can cost as much as 200
 cycles, and a mispredicted branch costs in the region of 10
 to 30 cycles.  Detailed cache and branch profiling can be very useful
 for understanding how your program interacts with the machine and thus how
@@ -118,24 +130,24 @@
 <programlisting><![CDATA[
 ==31751== I   refs:      27,742,716
 ==31751== I1  misses:           276
-==31751== L2i misses:           275
+==31751== LLi misses:           275
 ==31751== I1  miss rate:        0.0%
-==31751== L2i miss rate:        0.0%
+==31751== LLi miss rate:        0.0%
 ==31751== 
 ==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
 ==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
-==31751== L2d misses:        23,085  (     3,987 rd +    19,098 wr)
+==31751== LLd misses:        23,085  (     3,987 rd +    19,098 wr)
 ==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
-==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
+==31751== LLd miss rate:        0.1% (       0.0%   +       0.4%)
 ==31751== 
-==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
-==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)]]></programlisting>
+==31751== LL misses:         23,360  (     4,262 rd +    19,098 wr)
+==31751== LL miss rate:         0.0% (       0.0%   +       0.4%)]]></programlisting>
 
 <para>Cache accesses for instruction fetches are summarised
 first, giving the number of fetches made (this is the number of
 instructions executed, which can be useful to know in its own
-right), the number of I1 misses, and the number of L2 instruction
-(<computeroutput>L2i</computeroutput>) misses.</para>
+right), the number of I1 misses, and the number of LL instruction
+(<computeroutput>LLi</computeroutput>) misses.</para>
 
 <para>Cache accesses for data follow. The information is similar
 to that of the instruction fetches, except that the values are
@@ -144,12 +156,12 @@
 <computeroutput>wr</computeroutput> values add up to the row's
 total).</para>
 
-<para>Combined instruction and data figures for the L2 cache
-follow that.  Note that the L2 miss rate is computed relative to the total
+<para>Combined instruction and data figures for the LL cache
+follow that.  Note that the LL miss rate is computed relative to the total
 number of memory accesses, not the number of L1 misses.  I.e.  it is
-<computeroutput>(I2mr + D2mr + D2mw) / (Ir + Dr + Dw)</computeroutput>
+<computeroutput>(ILmr + DLmr + DLmw) / (Ir + Dr + Dw)</computeroutput>
 not
-<computeroutput>(I2mr + D2mr + D2mw) / (I1mr + D1mr + D1mw)</computeroutput>
+<computeroutput>(ILmr + DLmr + DLmw) / (I1mr + D1mr + D1mw)</computeroutput>
 </para>
 
 <para>Branch prediction statistics are not collected by default.
@@ -208,11 +220,11 @@
 --------------------------------------------------------------------------------
 I1 cache:              65536 B, 64 B, 2-way associative
 D1 cache:              65536 B, 64 B, 2-way associative
-L2 cache:              262144 B, 64 B, 8-way associative
+LL cache:              262144 B, 64 B, 8-way associative
 Command:               concord vg_to_ucode.c
-Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Events recorded:       Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
+Events shown:          Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
+Event sort order:      Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
 Threshold:             99%
 Chosen for annotation:
 Auto-annotation:       off
@@ -224,7 +236,7 @@
 <itemizedlist>
 
   <listitem>
-    <para>I1 cache, D1 cache, L2 cache: cache configuration.  So
+    <para>I1 cache, D1 cache, LL cache: cache configuration.  So
     you know the configuration with which these results were
     obtained.</para>
   </listitem>
@@ -300,7 +312,7 @@
   
 <programlisting><![CDATA[
 --------------------------------------------------------------------------------
-Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
+Ir         I1mr ILmr Dr         D1mr   DLmr  Dw        D1mw   DLmw
 --------------------------------------------------------------------------------
 27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS]]></programlisting>
 
@@ -312,7 +324,7 @@
 
 <programlisting><![CDATA[
 --------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
+Ir        I1mr ILmr Dr        D1mr  DLmr  Dw        D1mw   DLmw    file:function
 --------------------------------------------------------------------------------
 8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
 5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
@@ -367,7 +379,7 @@
 --------------------------------------------------------------------------------
 -- User-annotated source: concord.c
 --------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
+Ir        I1mr ILmr Dr      D1mr  DLmr  Dw      D1mw   DLmw
 
         .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
         3    1    1       .     .     .       1      0      0  {
@@ -687,7 +699,7 @@
 <computeroutput>Events:</computeroutput> lines of all the inputs are
 identical, so as to ensure that the addition of costs makes sense.
 For example, it would be nonsensical for it to add a number indicating
-D1 read references to a number from a different file indicating L2
+D1 read references to a number from a different file indicating LL
 write misses.</para>
 
 <para>
@@ -746,7 +758,7 @@
 <computeroutput>Events:</computeroutput> lines of all the inputs are
 identical, so as to ensure that the addition of costs makes sense.
 For example, it would be nonsensical for it to add a number indicating
-D1 read references to a number from a different file indicating L2
+D1 read references to a number from a different file indicating LL
 write misses.</para>
 
 <para>
@@ -810,12 +822,12 @@
     </listitem>
   </varlistentry>
 
-  <varlistentry id="opt.L2" xreflabel="--L2">
+  <varlistentry id="opt.LL" xreflabel="--LL">
     <term>
-      <option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
+      <option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
     </term>
     <listitem>
-      <para>Specify the size, associativity and line size of the level 2
+      <para>Specify the size, associativity and line size of the last-level
       cache.</para>
     </listitem>
   </varlistentry>
@@ -903,9 +915,9 @@
       order). Default is to use all present in the
       <filename>cachegrind.out.&lt;pid&gt;</filename> file (and
       use the order in the file).  Useful if you want to concentrate on, for
-      example, I cache misses (<option>--show=I1mr,I2mr</option>), or data
-      read misses (<option>--show=D1mr,D2mr</option>), or L2 data misses
-      (<option>--show=D2mr,D2mw</option>).  Best used in conjunction with
+      example, I cache misses (<option>--show=I1mr,ILmr</option>), or data
+      read misses (<option>--show=D1mr,DLmr</option>), or LL data misses
+      (<option>--show=DLmr,DLmw</option>).  Best used in conjunction with
       <option>--sort</option>.</para>
     </listitem>
   </varlistentry>
@@ -935,9 +947,9 @@
       events by appending any events for the
       <option>--sort</option> option with a colon
       and a number (no spaces, though).  E.g. if you want to see
-      each function that covers more than 1% of L2 read misses or 1% of L2
+      each function that covers more than 1% of LL read misses or 1% of LL
       write misses, use this option:</para>
-      <para><option>--sort=D2mr:1,D2mw:1</option></para>
+      <para><option>--sort=DLmr:1,DLmw:1</option></para>
     </listitem>
   </varlistentry>
 
@@ -1059,13 +1071,13 @@
 bottlenecks.</para>
 
 <para>
-After that, we have found that L2 misses are typically a much bigger source
+After that, we have found that LL misses are typically a much bigger source
 of slow-downs than L1 misses.  So it's worth looking for any snippets of
-code with high <computeroutput>D2mr</computeroutput> or
-<computeroutput>D2mw</computeroutput> counts.  (You can use
-<option>--show=D2mr
---sort=D2mr</option> with cg_annotate to focus just on
-<literal>D2mr</literal> counts, for example.) If you find any, it's still
+code with high <computeroutput>DLmr</computeroutput> or
+<computeroutput>DLmw</computeroutput> counts.  (You can use
+<option>--show=DLmr
+--sort=DLmr</option> with cg_annotate to focus just on
+<literal>DLmr</literal> counts, for example.) If you find any, it's still
 not always easy to work out how to improve things.  You need to have a
 reasonable understanding of how caches work, the principles of locality, and
 your program's data access patterns.  Improving things may require
@@ -1153,12 +1165,12 @@
   </listitem>
 
   <listitem>
-    <para>Inclusive L2 cache: the L2 cache typically replicates all
+    <para>Inclusive LL cache: the LL cache typically replicates all
     the entries of the L1 caches, because fetching into L1 involves
-    fetching into L2 first (this does not guarantee strict inclusiveness,
-    as lines evicted from L2 still could reside in L1).  This is
+    fetching into LL first (this does not guarantee strict inclusiveness,
+    as lines evicted from LL still could reside in L1).  This is
     standard on Pentium chips, but AMD Opterons, Athlons and Durons
-    use an exclusive L2 cache that only holds
+    use an exclusive LL cache that only holds
     blocks evicted from L1.  Ditto most modern VIA CPUs.</para>
   </listitem>
 
@@ -1172,10 +1184,10 @@
 Cachegrind will fall back to using a default configuration (that
 of a model 3/4 Athlon).  Cachegrind will tell you if this
 happens.  You can manually specify one, two or all three levels
-(I1/D1/L2) of the cache from the command line using the
+(I1/D1/LL) of the cache from the command line using the
 <option>--I1</option>,
 <option>--D1</option> and
-<option>--L2</option> options.
+<option>--LL</option> options.
 For cache parameters to be valid for simulation, the number
 of sets (with associativity being the number of cache lines in
 each set) has to be a power of two.</para>
@@ -1186,7 +1198,7 @@
 need to specify it with the
 <option>--I1</option>,
 <option>--D1</option> and
-<option>--L2</option> options.</para>
+<option>--LL</option> options.</para>
 
 
 <para>Other noteworthy behaviour:</para>
diff --git a/cachegrind/tests/chdir.stderr.exp b/cachegrind/tests/chdir.stderr.exp
index 8eaf654..e8084c1 100644
--- a/cachegrind/tests/chdir.stderr.exp
+++ b/cachegrind/tests/chdir.stderr.exp
@@ -2,16 +2,16 @@
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/dlclose.stderr.exp b/cachegrind/tests/dlclose.stderr.exp
index 8eaf654..e8084c1 100644
--- a/cachegrind/tests/dlclose.stderr.exp
+++ b/cachegrind/tests/dlclose.stderr.exp
@@ -2,16 +2,16 @@
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/filter_stderr b/cachegrind/tests/filter_stderr
index 6ec44bf..6a14e34 100755
--- a/cachegrind/tests/filter_stderr
+++ b/cachegrind/tests/filter_stderr
@@ -7,11 +7,11 @@
 # Remove "Cachegrind, ..." line and the following copyright line.
 sed "/^Cachegrind, a cache and branch-prediction profiler/ , /./ d" |
 
-# Remove numbers from I/D/L2 "refs:" lines
-perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/'  |
+# Remove numbers from I/D/LL "refs:" lines
+perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/'  |
 
-# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
-perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
+# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
+perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
 
 # Remove CPUID warnings lines for P4s and other machines
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
diff --git a/cachegrind/tests/notpower2.stderr.exp b/cachegrind/tests/notpower2.stderr.exp
index 8eaf654..e8084c1 100644
--- a/cachegrind/tests/notpower2.stderr.exp
+++ b/cachegrind/tests/notpower2.stderr.exp
@@ -2,16 +2,16 @@
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/notpower2.vgtest b/cachegrind/tests/notpower2.vgtest
index 132cfe5..21caffe 100644
--- a/cachegrind/tests/notpower2.vgtest
+++ b/cachegrind/tests/notpower2.vgtest
@@ -1,3 +1,3 @@
 prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
 cleanup: rm cachegrind.out.*
diff --git a/cachegrind/tests/wrap5.stderr.exp b/cachegrind/tests/wrap5.stderr.exp
index 8eaf654..e8084c1 100644
--- a/cachegrind/tests/wrap5.stderr.exp
+++ b/cachegrind/tests/wrap5.stderr.exp
@@ -2,16 +2,16 @@
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/x86/fpu-28-108.stderr.exp b/cachegrind/tests/x86/fpu-28-108.stderr.exp
index 8eaf654..e8084c1 100644
--- a/cachegrind/tests/x86/fpu-28-108.stderr.exp
+++ b/cachegrind/tests/x86/fpu-28-108.stderr.exp
@@ -2,16 +2,16 @@
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/docs/cl-format.xml b/callgrind/docs/cl-format.xml
index 97b3543..7fce318 100644
--- a/callgrind/docs/cl-format.xml
+++ b/callgrind/docs/cl-format.xml
@@ -414,7 +414,7 @@
     <para>This specifies various information for this dump.  For some 
     types, the semantic is defined, but any description type is allowed. 
     Unknown types should be ignored.</para>
-    <para>There are the types "I1 cache", "D1 cache", "L2 cache", which 
+    <para>There are the types "I1 cache", "D1 cache", "LL cache", which 
     specify parameters used for the cache simulator.  These are the only
     types originally used by Cachegrind.  Additionally, Callgrind uses 
     the following types:  "Timerange" gives a rough range of the basic
@@ -457,7 +457,7 @@
           <para><command>I1mr</command>: Instruction Level 1 read cache miss</para>
         </listitem>
         <listitem>
-          <para><command>I2mr</command>: Instruction Level 2 read cache miss</para>
+          <para><command>ILmr</command>: Instruction last-level read cache miss</para>
         </listitem>
         <listitem>
           <para>...</para>
diff --git a/callgrind/docs/cl-manual.xml b/callgrind/docs/cl-manual.xml
index e2289ff..3f8330e 100644
--- a/callgrind/docs/cl-manual.xml
+++ b/callgrind/docs/cl-manual.xml
@@ -933,9 +933,9 @@
       <para>Specify if you want to do full cache simulation.  By default,
       only instruction read accesses will be counted ("Ir").
       With cache simulation, further event counters are enabled:
-      Cache misses on instruction reads ("I1mr"/"I2mr"),
-      data read accesses ("Dr") and related cache misses ("D1mr"/"D2mr"),
-      data write accesses ("Dw") and related cache misses ("D1mw"/"D2mw").
+      Cache misses on instruction reads ("I1mr"/"ILmr"),
+      data read accesses ("Dr") and related cache misses ("D1mr"/"DLmr"),
+      data write accesses ("Dw") and related cache misses ("D1mw"/"DLmw").
       For more information, see <xref linkend="cg-manual"/>.
       </para>
     </listitem>
@@ -972,13 +972,13 @@
     </term>
     <listitem>
       <para>Specify whether write-back behavior should be simulated, allowing
-      to distinguish L2 caches misses with and without write backs.
+      you to distinguish LL cache misses with and without write-backs.
       The cache model of Cachegrind/Callgrind does not specify write-through
       vs. write-back behavior, and this also is not relevant for the number
       of generated miss counts. However, with explicit write-back simulation
       it can be decided whether a miss triggers not only the loading of a new
       cache line, but also if a write back of a dirty cache line had to take
-      place before. The new dirty miss events are I2dmr, D2dmr, and D2dmw,
+      place before. The new dirty miss events are ILdmr, DLdmr, and DLdmw,
       for misses because of instruction read, data read, and data write,
       respectively. As they produce two memory transactions, they should
       account for a doubled time estimation in relation to a normal miss.
@@ -1016,13 +1016,13 @@
       bad access behavior). The new counters are defined in a way such
       that worse behavior results in higher cost.
       AcCost1 and AcCost2 are counters showing bad temporal locality
-      for L1 and L2 caches, respectively. This is done by summing up
+      for L1 and LL caches, respectively. This is done by summing up
       reciprocal values of the numbers of accesses of each cache line,
       multiplied by 1000 (as only integer costs are allowed). E.g. for
       a given source line with 5 read accesses, a value of 5000 AcCost
       means that for every access, a new cache line was loaded and directly
       evicted afterwards without further accesses. Similarly, SpLoss1/2
-      shows bad spatial locality for L1 and L2 caches, respectively. It
+      shows bad spatial locality for L1 and LL caches, respectively. It
       gives the <emphasis>spatial loss</emphasis> count of bytes which
-      were loaded into cache but never accessed. It pinpoints at code
+      were loaded into cache but never accessed. It pinpoints code
       accessing data in a way such that cache space is wasted. This hints
@@ -1059,12 +1059,12 @@
     </listitem>
   </varlistentry>
 
-  <varlistentry id="opt.L2" xreflabel="--L2">
+  <varlistentry id="opt.LL" xreflabel="--LL">
     <term>
-      <option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
+      <option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
     </term>
     <listitem>
-      <para>Specify the size, associativity and line size of the level 2
+      <para>Specify the size, associativity and line size of the last-level
       cache.</para>
     </listitem>
   </varlistentry>
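
Note: the AcCost/SpLoss arithmetic described under --cacheuse above is
small enough to sketch in full.  The following is a simplified,
standalone version of the per-eviction update done by update_LL_use()
and the UPDATE_USE macro in sim.c; the function name and struct layout
here are illustrative, but the 1000/count and unused-bytes formulas are
the ones the simulator uses:

   typedef struct {
      unsigned int mask;   /* one bit per 1/32 of the line, set on access */
      int          count;  /* accesses since the line was loaded          */
   } line_use;

   /* Called when a cache line is evicted (or at the final flush). */
   static void update_use_counters(const line_use* use, int line_size,
                                   long* acCost, long* spLoss)
   {
      /* Bad temporal locality: a line evicted after a single access
       * adds the full 1000; a heavily reused line adds almost nothing. */
      if (use->count > 0)
         *acCost += 1000 / use->count;

      /* Bad spatial locality: bytes loaded into the line but never
       * touched before eviction (the 32-bit mask covers the whole line). */
      *spLoss += ((32 - __builtin_popcount(use->mask)) * line_size) >> 5;
   }
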
diff --git a/callgrind/sim.c b/callgrind/sim.c
index cb41d57..2b8cbe4 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -91,7 +91,7 @@
  * States of flat caches in our model.
- * We use a 2-level hierarchy, 
+ * We use a 2-level hierarchy.
  */
-static cache_t2 I1, D1, L2;
+static cache_t2 I1, D1, LL;
 
 /* Lower bits of cache tags are used as flags for a cache line */
 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
@@ -123,8 +123,8 @@
 static Int off_I1_SpLoss  = 1;
 static Int off_D1_AcCost  = 0;
 static Int off_D1_SpLoss  = 1;
-static Int off_L2_AcCost  = 2;
-static Int off_L2_SpLoss  = 3;
+static Int off_LL_AcCost  = 2;
+static Int off_LL_SpLoss  = 3;
 
 /* Cache access types */
 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
@@ -135,7 +135,7 @@
 /* Result of a reference into a hierarchical cache model */
 typedef enum {
     L1_Hit, 
-    L2_Hit,
+    LL_Hit,
     MemAccess,
     WriteBackMemAccess } CacheModelResult;
 
@@ -231,7 +231,7 @@
 /*------------------------------------------------------------*/
 
 /*
- * Simple model: L1 & L2 Write Through
+ * Simple model: L1 & LL Write Through
  * Does not distinguish among read and write references
  *
  * Simulator functions:
@@ -305,7 +305,7 @@
 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 {
     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
-    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
     return MemAccess;
 }
 
@@ -313,7 +313,7 @@
 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 {
     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
-    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
     return MemAccess;
 }
 
@@ -323,7 +323,7 @@
 /*------------------------------------------------------------*/
 
 /*
- * More complex model: L1 Write-through, L2 Write-back
+ * More complex model: L1 Write-through, LL Write-back
  * This needs to distinguish among read and write references.
  *
  * Simulator functions:
@@ -412,8 +412,8 @@
 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 {
     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
-    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
-	case Hit: return L2_Hit;
+    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+	case Hit: return LL_Hit;
 	case Miss: return MemAccess;
 	default: break;
     }
@@ -424,8 +424,8 @@
 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 {
     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
-    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
-	case Hit: return L2_Hit;
+    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+	case Hit: return LL_Hit;
 	case Miss: return MemAccess;
 	default: break;
     }
@@ -437,14 +437,14 @@
 {
     if ( cachesim_ref( &D1, a, size) == Hit ) {
-	/* Even for a L1 hit, the write-trough L1 passes
-	 * the write to the L2 to make the L2 line dirty.
+	/* Even for an L1 hit, the write-through L1 passes
+	 * the write to the LL to make the LL line dirty.
 	 * But this causes no latency, so return the hit.
 	 */
-	cachesim_ref_wb( &L2, Write, a, size);
+	cachesim_ref_wb( &LL, Write, a, size);
 	return L1_Hit;
     }
-    switch( cachesim_ref_wb( &L2, Write, a, size) ) {
-	case Hit: return L2_Hit;
+    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
+	case Hit: return LL_Hit;
 	case Miss: return MemAccess;
 	default: break;
     }
@@ -479,10 +479,10 @@
  * One stream can be detected per 4k page.
  */
 static __inline__
-void prefetch_L2_doref(Addr a)
+void prefetch_LL_doref(Addr a)
 {
   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
-  UInt block = ( a >> L2.line_size_bits);
+  UInt block = ( a >> LL.line_size_bits);
 
   if (block != pf_lastblock[stream]) {
     if (pf_seqblocks[stream] == 0) {
@@ -494,7 +494,7 @@
 	pf_seqblocks[stream]++;
 	if (pf_seqblocks[stream] >= 2) {
 	  prefetch_up++;
-	  cachesim_ref(&L2, a + 5 * L2.line_size,1);
+	  cachesim_ref(&LL, a + 5 * LL.line_size,1);
 	}
       }
       else pf_seqblocks[stream] = 0;
@@ -504,7 +504,7 @@
 	pf_seqblocks[stream]--;
 	if (pf_seqblocks[stream] <= -2) {
 	  prefetch_down++;
-	  cachesim_ref(&L2, a - 5 * L2.line_size,1);
+	  cachesim_ref(&LL, a - 5 * LL.line_size,1);
 	}
       }
       else pf_seqblocks[stream] = 0;
@@ -519,8 +519,8 @@
 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
 {
     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
-    prefetch_L2_doref(a);
-    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+    prefetch_LL_doref(a);
+    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
     return MemAccess;
 }
 
@@ -528,8 +528,8 @@
 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
 {
     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
-    prefetch_L2_doref(a);
-    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+    prefetch_LL_doref(a);
+    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
     return MemAccess;
 }
 
@@ -540,9 +540,9 @@
 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
 {
     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
-    prefetch_L2_doref(a);
-    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
-	case Hit: return L2_Hit;
+    prefetch_LL_doref(a);
+    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+	case Hit: return LL_Hit;
 	case Miss: return MemAccess;
 	default: break;
     }
@@ -553,9 +553,9 @@
 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
 {
     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
-    prefetch_L2_doref(a);
-    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
-	case Hit: return L2_Hit;
+    prefetch_LL_doref(a);
+    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+	case Hit: return LL_Hit;
 	case Miss: return MemAccess;
 	default: break;
     }
@@ -565,17 +565,17 @@
 static
 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
 {
-    prefetch_L2_doref(a);
+    prefetch_LL_doref(a);
     if ( cachesim_ref( &D1, a, size) == Hit ) {
-	/* Even for a L1 hit, the write-trough L1 passes
-	 * the write to the L2 to make the L2 line dirty.
+	/* Even for an L1 hit, the write-through L1 passes
+	 * the write to the LL to make the LL line dirty.
 	 * But this causes no latency, so return the hit.
 	 */
-	cachesim_ref_wb( &L2, Write, a, size);
+	cachesim_ref_wb( &LL, Write, a, size);
 	return L1_Hit;
     }
-    switch( cachesim_ref_wb( &L2, Write, a, size) ) {
-	case Hit: return L2_Hit;
+    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
+	case Hit: return LL_Hit;
 	case Miss: return MemAccess;
 	default: break;
     }
@@ -736,7 +736,7 @@
    /* Second case: word straddles two lines. */                             \
    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
    } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
-      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */           \
+      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
       set = &(L.tags[set1 * L.assoc]);                                      \
       use_mask = L.line_start_mask[a & L.line_size_mask];		    \
       if (tag == (set[0] & L.tag_mask)) {                                   \
@@ -809,7 +809,7 @@
       idx = (set2 * L.assoc) + tmp_tag;                                     \
       miss2 = update_##L##_use(&L, idx,			                    \
 		       use_mask, (a+size-1) &~ L.line_size_mask);	    \
-      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit;     \
+      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
                                                                             \
    } else {                                                                 \
        VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
@@ -837,13 +837,13 @@
   return c;
 }
 
-static void update_L2_use(int idx, Addr memline)
+static void update_LL_use(int idx, Addr memline)
 {
-  line_loaded* loaded = &(L2.loaded[idx]);
-  line_use* use = &(L2.use[idx]);
-  int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
+  line_loaded* loaded = &(LL.loaded[idx]);
+  line_use* use = &(LL.use[idx]);
+  int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
   
-  CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
+  CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
            idx, CLG_(bb_base) + current_ii->instr_offset, memline);
   if (use->count>0) {
     CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
@@ -852,8 +852,8 @@
 	     CLG_(current_state).collect, loaded->use_base);
     
     if (CLG_(current_state).collect && loaded->use_base) {
-      (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
-      (loaded->use_base)[off_L2_SpLoss] += i;
+      (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
+      (loaded->use_base)[off_LL_SpLoss] += i;
     }
    }
 
@@ -868,53 +868,53 @@
 }
 
 static
-CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
+CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
 {
-   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
-   UWord* set = &(L2.tags[setNo * L2.assoc]);
-   UWord tag  = memline & L2.tag_mask;
+   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
+   UWord* set = &(LL.tags[setNo * LL.assoc]);
+   UWord tag  = memline & LL.tag_mask;
 
    int i, j, idx;
    UWord tmp_tag;
    
-   CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
+   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);
 
-   if (tag == (set[0] & L2.tag_mask)) {
-     idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
-     l1_loaded->dep_use = &(L2.use[idx]);
+   if (tag == (set[0] & LL.tag_mask)) {
+     idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
+     l1_loaded->dep_use = &(LL.use[idx]);
 
      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
-		 idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
-		 L2.use[idx].mask, L2.use[idx].count);
-     return L2_Hit;
+		 idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
+		 LL.use[idx].mask, LL.use[idx].count);
+     return LL_Hit;
    }
-   for (i = 1; i < L2.assoc; i++) {
-     if (tag == (set[i] & L2.tag_mask)) {
+   for (i = 1; i < LL.assoc; i++) {
+     if (tag == (set[i] & LL.tag_mask)) {
        tmp_tag = set[i];
        for (j = i; j > 0; j--) {
 	 set[j] = set[j - 1];
        }
        set[0] = tmp_tag;
-       idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
-       l1_loaded->dep_use = &(L2.use[idx]);
+       idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
+       l1_loaded->dep_use = &(LL.use[idx]);
 
 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
-		 i, idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
-		 L2.use[idx].mask, L2.use[idx].count);
-	return L2_Hit;
+		 i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
+		 LL.use[idx].mask, LL.use[idx].count);
+	return LL_Hit;
      }
    }
 
    /* A miss;  install this tag as MRU, shuffle rest down. */
-   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
-   for (j = L2.assoc - 1; j > 0; j--) {
+   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
+   for (j = LL.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
    }
    set[0] = tag | tmp_tag;
-   idx = (setNo * L2.assoc) + tmp_tag;
-   l1_loaded->dep_use = &(L2.use[idx]);
+   idx = (setNo * LL.assoc) + tmp_tag;
+   l1_loaded->dep_use = &(LL.use[idx]);
 
-   update_L2_use(idx, memline);
+   update_LL_use(idx, memline);
 
    return MemAccess;
 }
@@ -943,7 +943,7 @@
       (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
       (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                      \
-      /* FIXME (?): L1/L2 line sizes must be equal ! */              \
+      /* FIXME (?): L1/LL line sizes must be equal ! */              \
       loaded->dep_use->mask |= use->mask;                            \
       loaded->dep_use->count += use->count;                          \
     }                                                                \
@@ -957,8 +957,8 @@
     CLG_(current_state).nonskipped->skipped :                        \
     CLG_(cost_base) + current_ii->cost_offset;                       \
                                                                      \
-  if (memline == 0) return L2_Hit;                                   \
-  return cacheuse_L2_access(memline, loaded);                        \
+  if (memline == 0) return LL_Hit;                                   \
+  return cacheuse_LL_access(memline, loaded);                        \
 }
 
 UPDATE_USE(I1);
@@ -991,10 +991,10 @@
       if (D1.loaded[i].use_base)
 	update_D1_use( &D1, i, 0,0);
 
-  if (L2.use)
-    for (i = 0; i < L2.sets * L2.assoc; i++)
-      if (L2.loaded[i].use_base)
-	update_L2_use(i, 0);
+  if (LL.use)
+    for (i = 0; i < LL.sets * LL.assoc; i++)
+      if (LL.loaded[i].use_base)
+	update_LL_use(i, 0);
 }
   
 
@@ -1020,7 +1020,7 @@
 	    c2[2]++;
 	    // fall through
 
-	case L2_Hit:
+	case LL_Hit:
 	    c1[1]++;
 	    c2[1]++;
 	    // fall through
@@ -1036,9 +1036,9 @@
 {
     switch(r) {
     case L1_Hit:    return "L1 Hit ";
-    case L2_Hit:    return "L2 Hit ";
-    case MemAccess: return "L2 Miss";
-    case WriteBackMemAccess: return "L2 Miss (dirty)";
+    case LL_Hit:    return "LL Hit ";
+    case MemAccess: return "LL Miss";
+    case WriteBackMemAccess: return "LL Miss (dirty)";
     default:
 	tl_assert(0);
     }
@@ -1268,7 +1268,7 @@
 
 static cache_t clo_I1_cache = UNDEFINED_CACHE;
 static cache_t clo_D1_cache = UNDEFINED_CACHE;
-static cache_t clo_L2_cache = UNDEFINED_CACHE;
+static cache_t clo_LL_cache = UNDEFINED_CACHE;
 
 
 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
@@ -1308,7 +1308,7 @@
 }
 
 static
-void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
 {
 #define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)
 
@@ -1317,30 +1317,30 @@
    Bool all_caches_clo_defined =
       (DEFINED(clo_I1_cache) &&
        DEFINED(clo_D1_cache) &&
-       DEFINED(clo_L2_cache));
+       DEFINED(clo_LL_cache));
 
    // Set the cache config (using auto-detection, if supported by the
    // architecture).
-   VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
+   VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
 
    // Check the default/auto-detected values.
    checkRes = check_cache(I1c);  tl_assert(!checkRes);
    checkRes = check_cache(D1c);  tl_assert(!checkRes);
-   checkRes = check_cache(L2c);  tl_assert(!checkRes);
+   checkRes = check_cache(LLc);  tl_assert(!checkRes);
 
    // Then replace with any defined on the command line.
    if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
    if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
-   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
+   if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
 
    if (VG_(clo_verbosity) > 1) {
-      VG_(message)(Vg_UserMsg, "Cache configuration used:\n");
-      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines\n",
-                               I1c->size, I1c->assoc, I1c->line_size);
-      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines\n",
-                               D1c->size, D1c->assoc, D1c->line_size);
-      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines\n",
-                               L2c->size, L2c->assoc, L2c->line_size);
+      VG_(umsg)("Cache configuration used:\n");
+      VG_(umsg)("  I1: %dB, %d-way, %dB lines\n",
+                I1c->size, I1c->assoc, I1c->line_size);
+      VG_(umsg)("  D1: %dB, %d-way, %dB lines\n",
+                D1c->size, D1c->assoc, D1c->line_size);
+      VG_(umsg)("  LL: %dB, %d-way, %dB lines\n",
+                LLc->size, LLc->assoc, LLc->line_size);
    }
 #undef CMD_LINE_DEFINED
 }
@@ -1350,7 +1350,7 @@
 static void cachesim_post_clo_init(void)
 {
   /* Cache configurations. */
-  cache_t  I1c, D1c, L2c;
+  cache_t  I1c, D1c, LLc;
 
   /* Initialize access handlers */
   if (!CLG_(clo).simulate_cache) {
@@ -1374,15 +1374,15 @@
   }
 
   /* Configuration of caches only needed with real cache simulation */
-  configure_caches(&I1c, &D1c, &L2c);
+  configure_caches(&I1c, &D1c, &LLc);
   
   I1.name = "I1";
   D1.name = "D1";
-  L2.name = "L2";
+  LL.name = "LL";
 
   cachesim_initcache(I1c, &I1);
   cachesim_initcache(D1c, &D1);
-  cachesim_initcache(L2c, &L2);
+  cachesim_initcache(LLc, &LL);
 
   /* the other cache simulators use the standard helpers
    * with dispatching via simulator struct */
@@ -1463,7 +1463,7 @@
 {
   cachesim_clearcache(&I1);
   cachesim_clearcache(&D1);
-  cachesim_clearcache(&L2);
+  cachesim_clearcache(&LL);
 
   prefetch_clear();
 }
@@ -1474,7 +1474,7 @@
   Int p;
   p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
-  VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
+  VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
 }
 
 static
@@ -1490,11 +1490,12 @@
 "    --cacheuse=no|yes         Collect cache block use [no]\n"
 "    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
 "    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
-"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
+"    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
 	      );
 }
 
-static void parse_opt ( cache_t* cache, char* opt, Char* optval )
+static void parse_opt ( cache_t* cache,
+                        char* opt, Char* optval, UChar kind )
 {
    Long i1, i2, i3;
    Char* endptr;
@@ -1550,11 +1551,12 @@
    }
 
    else if VG_STR_CLO(arg, "--I1", tmp_str)
-      parse_opt(&clo_I1_cache, arg, tmp_str);
+      parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
    else if VG_STR_CLO(arg, "--D1", tmp_str)
-      parse_opt(&clo_D1_cache, arg, tmp_str);
-   else if VG_STR_CLO(arg, "--L2", tmp_str)
-      parse_opt(&clo_L2_cache, arg, tmp_str);
+      parse_opt(&clo_D1_cache, arg, tmp_str, '1');
+   else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
+            VG_STR_CLO(arg, "--LL", tmp_str))
+      parse_opt(&clo_LL_cache, arg, tmp_str, '2');
   else
     return False;
 
@@ -1613,8 +1615,8 @@
 void cachesim_printstat(Int l1, Int l2, Int l3)
 {
   FullCost total = CLG_(total_cost), D_total = 0;
-  ULong L2_total_m, L2_total_mr, L2_total_mw,
-    L2_total, L2_total_r, L2_total_w;
+  ULong LL_total_m, LL_total_mr, LL_total_mw,
+    LL_total, LL_total_r, LL_total_w;
   char buf1[RESULTS_BUF_LEN], 
     buf2[RESULTS_BUF_LEN], 
     buf3[RESULTS_BUF_LEN];
@@ -1632,7 +1634,7 @@
   VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);
 
   commify(total[fullOffset(EG_IR) +2], l1, buf1);
-  VG_(message)(Vg_UserMsg, "L2i misses:    %s\n", buf1);
+  VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);
 
   p = 100;
 
@@ -1645,7 +1647,7 @@
        
   percentify(total[fullOffset(EG_IR)+2] * 100 * p /
 	     total[fullOffset(EG_IR)], p, l1+1, buf1);
-  VG_(message)(Vg_UserMsg, "L2i miss rate: %s\n", buf1);
+  VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
   VG_(message)(Vg_UserMsg, "\n");
    
   /* D cache results.
@@ -1673,7 +1675,7 @@
   commify( D_total[2], l1, buf1);
   commify(total[fullOffset(EG_DR)+2], l2, buf2);
   commify(total[fullOffset(EG_DW)+2], l3, buf3);
-  VG_(message)(Vg_UserMsg, "L2d misses:    %s  (%s rd + %s wr)\n",
+  VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
 	       buf1, buf2, buf3);
 
   p = 10;
@@ -1695,50 +1697,50 @@
 	     total[fullOffset(EG_DR)], p, l2+1, buf2);
   percentify(total[fullOffset(EG_DW)+2] * 100 * p /
 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
-  VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )\n", 
+  VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n", 
                buf1, buf2,buf3);
   VG_(message)(Vg_UserMsg, "\n");
 
 
   
-  /* L2 overall results */
+  /* LL overall results */
   
-  L2_total   =
+  LL_total   =
     total[fullOffset(EG_DR) +1] +
     total[fullOffset(EG_DW) +1] +
     total[fullOffset(EG_IR) +1];
-  L2_total_r =
+  LL_total_r =
     total[fullOffset(EG_DR) +1] +
     total[fullOffset(EG_IR) +1];
-  L2_total_w = total[fullOffset(EG_DW) +1];
-  commify(L2_total,   l1, buf1);
-  commify(L2_total_r, l2, buf2);
-  commify(L2_total_w, l3, buf3);
-  VG_(message)(Vg_UserMsg, "L2 refs:       %s  (%s rd + %s wr)\n",
+  LL_total_w = total[fullOffset(EG_DW) +1];
+  commify(LL_total,   l1, buf1);
+  commify(LL_total_r, l2, buf2);
+  commify(LL_total_w, l3, buf3);
+  VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
 	       buf1, buf2, buf3);
   
-  L2_total_m  =
+  LL_total_m  =
     total[fullOffset(EG_DR) +2] +
     total[fullOffset(EG_DW) +2] +
     total[fullOffset(EG_IR) +2];
-  L2_total_mr =
+  LL_total_mr =
     total[fullOffset(EG_DR) +2] +
     total[fullOffset(EG_IR) +2];
-  L2_total_mw = total[fullOffset(EG_DW) +2];
-  commify(L2_total_m,  l1, buf1);
-  commify(L2_total_mr, l2, buf2);
-  commify(L2_total_mw, l3, buf3);
-  VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)\n",
+  LL_total_mw = total[fullOffset(EG_DW) +2];
+  commify(LL_total_m,  l1, buf1);
+  commify(LL_total_mr, l2, buf2);
+  commify(LL_total_mw, l3, buf3);
+  VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
 	       buf1, buf2, buf3);
   
-  percentify(L2_total_m  * 100 * p /
+  percentify(LL_total_m  * 100 * p /
 	     (total[fullOffset(EG_IR)] + D_total[0]),  p, l1+1, buf1);
-  percentify(L2_total_mr * 100 * p /
+  percentify(LL_total_mr * 100 * p /
 	     (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
 	     p, l2+1, buf2);
-  percentify(L2_total_mw * 100 * p /
+  percentify(LL_total_mw * 100 * p /
 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
-  VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )\n",
+  VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
 	       buf1, buf2,buf3);
 }
 
@@ -1760,14 +1762,14 @@
     if (!CLG_(clo).simulate_cache)
 	CLG_(register_event_group)(EG_IR, "Ir");
     else if (!clo_simulate_writeback) {
-	CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "I2mr");
-	CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "D2mr");
-	CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "D2mw");
+	CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
+	CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
+	CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
     }
     else { // clo_simulate_writeback
-	CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "I2mr", "I2dmr");
-        CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "D2mr", "D2dmr");
-        CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
+	CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
+        CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
+        CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
     }
 
     if (CLG_(clo).simulate_branch) {
@@ -1807,12 +1809,12 @@
     CLG_(append_event)(CLG_(dumpmap), "I1mr");
     CLG_(append_event)(CLG_(dumpmap), "D1mr");
     CLG_(append_event)(CLG_(dumpmap), "D1mw");
-    CLG_(append_event)(CLG_(dumpmap), "I2mr");
-    CLG_(append_event)(CLG_(dumpmap), "D2mr");
-    CLG_(append_event)(CLG_(dumpmap), "D2mw");
-    CLG_(append_event)(CLG_(dumpmap), "I2dmr");
-    CLG_(append_event)(CLG_(dumpmap), "D2dmr");
-    CLG_(append_event)(CLG_(dumpmap), "D2dmw");
+    CLG_(append_event)(CLG_(dumpmap), "ILmr");
+    CLG_(append_event)(CLG_(dumpmap), "DLmr");
+    CLG_(append_event)(CLG_(dumpmap), "DLmw");
+    CLG_(append_event)(CLG_(dumpmap), "ILdmr");
+    CLG_(append_event)(CLG_(dumpmap), "DLdmr");
+    CLG_(append_event)(CLG_(dumpmap), "DLdmw");
     CLG_(append_event)(CLG_(dumpmap), "Bc");
     CLG_(append_event)(CLG_(dumpmap), "Bcm");
     CLG_(append_event)(CLG_(dumpmap), "Bi");
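
Note: one detail of cachesim_printstat() above that is easy to miss:
each event group stores three slots per stream -- references at +0, L1
misses at +1, and LL misses at +2 -- so the renamed summary lines are
plain sums over the three streams.  A minimal sketch of the same
arithmetic (the struct is invented for illustration):

   typedef struct {
      unsigned long acc;      /* slot +0: references                 */
      unsigned long l1_miss;  /* slot +1: missed L1, looked up in LL */
      unsigned long ll_miss;  /* slot +2: missed LL, went to memory  */
   } EventGroup;

   /* "LL refs" are exactly the L1 misses of all three streams... */
   static unsigned long ll_refs(EventGroup ir, EventGroup dr, EventGroup dw)
   {
      return ir.l1_miss + dr.l1_miss + dw.l1_miss;
   }

   /* ...and "LL misses" are the references that miss the last level too. */
   static unsigned long ll_misses(EventGroup ir, EventGroup dr, EventGroup dw)
   {
      return ir.ll_miss + dr.ll_miss + dw.ll_miss;
   }

The miss-rate lines then divide these sums by the corresponding +0
reference counts, as in the percentify() calls above.
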
diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr
index d2d7544..26bc3c0 100755
--- a/callgrind/tests/filter_stderr
+++ b/callgrind/tests/filter_stderr
@@ -13,11 +13,11 @@
 # Remove numbers from "Collected" line
 sed "s/^\(Collected *:\)[ 0-9]*$/\1/" |
 
-# Remove numbers from I/D/L2 "refs:" lines
-perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/'  |
+# Remove numbers from I/D/LL "refs:" lines
+perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/'  |
 
-# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
-perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
+# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
+perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
 
 # Remove numbers from "Branches:", "Mispredicts:, and "Mispred rate:" lines
 perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |
diff --git a/callgrind/tests/notpower2-hwpref.stderr.exp b/callgrind/tests/notpower2-hwpref.stderr.exp
index 0705c1c..974550a 100644
--- a/callgrind/tests/notpower2-hwpref.stderr.exp
+++ b/callgrind/tests/notpower2-hwpref.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2-hwpref.vgtest b/callgrind/tests/notpower2-hwpref.vgtest
index 9da7dce..1be3b13 100644
--- a/callgrind/tests/notpower2-hwpref.vgtest
+++ b/callgrind/tests/notpower2-hwpref.vgtest
@@ -1,3 +1,3 @@
 prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-hwpref=yes
 cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-use.stderr.exp b/callgrind/tests/notpower2-use.stderr.exp
index ea9acc8..6d41645 100644
--- a/callgrind/tests/notpower2-use.stderr.exp
+++ b/callgrind/tests/notpower2-use.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2-use.vgtest b/callgrind/tests/notpower2-use.vgtest
index b8312a7..23cec4a 100644
--- a/callgrind/tests/notpower2-use.vgtest
+++ b/callgrind/tests/notpower2-use.vgtest
@@ -1,3 +1,3 @@
 prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --cacheuse=yes
 cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-wb.stderr.exp b/callgrind/tests/notpower2-wb.stderr.exp
index 90da3e4..461ac96 100644
--- a/callgrind/tests/notpower2-wb.stderr.exp
+++ b/callgrind/tests/notpower2-wb.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2-wb.vgtest b/callgrind/tests/notpower2-wb.vgtest
index 34a1f6b..6cd016f 100644
--- a/callgrind/tests/notpower2-wb.vgtest
+++ b/callgrind/tests/notpower2-wb.vgtest
@@ -1,3 +1,3 @@
 prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-wb=yes
 cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2.stderr.exp b/callgrind/tests/notpower2.stderr.exp
index 0705c1c..974550a 100644
--- a/callgrind/tests/notpower2.stderr.exp
+++ b/callgrind/tests/notpower2.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2.vgtest b/callgrind/tests/notpower2.vgtest
index 73823d7..83b9946 100644
--- a/callgrind/tests/notpower2.vgtest
+++ b/callgrind/tests/notpower2.vgtest
@@ -1,3 +1,3 @@
 prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
 cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/simwork-both.stderr.exp b/callgrind/tests/simwork-both.stderr.exp
index b742c21..f8fb402 100644
--- a/callgrind/tests/simwork-both.stderr.exp
+++ b/callgrind/tests/simwork-both.stderr.exp
@@ -1,23 +1,23 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
 
 Branches:
 Mispredicts:
diff --git a/callgrind/tests/simwork-cache.stderr.exp b/callgrind/tests/simwork-cache.stderr.exp
index 0705c1c..974550a 100644
--- a/callgrind/tests/simwork-cache.stderr.exp
+++ b/callgrind/tests/simwork-cache.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/simwork1.stderr.exp b/callgrind/tests/simwork1.stderr.exp
index 0705c1c..974550a 100644
--- a/callgrind/tests/simwork1.stderr.exp
+++ b/callgrind/tests/simwork1.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/simwork2.stderr.exp b/callgrind/tests/simwork2.stderr.exp
index 90da3e4..461ac96 100644
--- a/callgrind/tests/simwork2.stderr.exp
+++ b/callgrind/tests/simwork2.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/simwork3.stderr.exp b/callgrind/tests/simwork3.stderr.exp
index ea9acc8..6d41645 100644
--- a/callgrind/tests/simwork3.stderr.exp
+++ b/callgrind/tests/simwork3.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/threads-use.stderr.exp b/callgrind/tests/threads-use.stderr.exp
index 4f0bb99..c8fd75e 100644
--- a/callgrind/tests/threads-use.stderr.exp
+++ b/callgrind/tests/threads-use.stderr.exp
@@ -1,20 +1,20 @@
 
 
-Events    : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
+Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
 Collected :
 
 I   refs:
 I1  misses:
-L2i misses:
+LLi misses:
 I1  miss rate:
-L2i miss rate:
+LLi miss rate:
 
 D   refs:
 D1  misses:
-L2d misses:
+LLd misses:
 D1  miss rate:
-L2d miss rate:
+LLd miss rate:
 
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate: