nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 1 | |
| 2 | /*--------------------------------------------------------------------*/ |
njn | 8b68b64 | 2009-06-24 00:37:09 +0000 | [diff] [blame] | 3 | /*--- x86- and AMD64-specific definitions. cg-x86-amd64.c ---*/ |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 4 | /*--------------------------------------------------------------------*/ |
| 5 | |
| 6 | /* |
| 7 | This file is part of Cachegrind, a Valgrind tool for cache |
| 8 | profiling programs. |
| 9 | |
sewardj | 03f8d3f | 2012-08-05 15:46:46 +0000 | [diff] [blame] | 10 | Copyright (C) 2002-2012 Nicholas Nethercote |
njn | 2bc1012 | 2005-05-08 02:10:27 +0000 | [diff] [blame] | 11 | njn@valgrind.org |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 12 | |
| 13 | This program is free software; you can redistribute it and/or |
| 14 | modify it under the terms of the GNU General Public License as |
| 15 | published by the Free Software Foundation; either version 2 of the |
| 16 | License, or (at your option) any later version. |
| 17 | |
| 18 | This program is distributed in the hope that it will be useful, but |
| 19 | WITHOUT ANY WARRANTY; without even the implied warranty of |
| 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 21 | General Public License for more details. |
| 22 | |
| 23 | You should have received a copy of the GNU General Public License |
| 24 | along with this program; if not, write to the Free Software |
| 25 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
| 26 | 02111-1307, USA. |
| 27 | |
| 28 | The GNU General Public License is contained in the file COPYING. |
| 29 | */ |
| 30 | |
njn | 8b68b64 | 2009-06-24 00:37:09 +0000 | [diff] [blame] | 31 | #if defined(VGA_x86) || defined(VGA_amd64) |
| 32 | |
njn | c7561b9 | 2005-06-19 01:24:32 +0000 | [diff] [blame] | 33 | #include "pub_tool_basics.h" |
njn | 6898086 | 2005-06-18 18:31:26 +0000 | [diff] [blame] | 34 | #include "pub_tool_cpuid.h" |
njn | 97405b2 | 2005-06-02 03:39:33 +0000 | [diff] [blame] | 35 | #include "pub_tool_libcbase.h" |
njn | f39e9a3 | 2005-06-12 02:43:17 +0000 | [diff] [blame] | 36 | #include "pub_tool_libcassert.h" |
njn | 36a20fa | 2005-06-03 03:08:39 +0000 | [diff] [blame] | 37 | #include "pub_tool_libcprint.h" |
njn | c7561b9 | 2005-06-19 01:24:32 +0000 | [diff] [blame] | 38 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 39 | #include "cg_arch.h" |
| 40 | |
// All CPUID info taken from sandpile.org/ia32/cpuid.htm
// Probably only works for Intel and AMD chips, and probably only for some of
// them.
| 44 | |
| 45 | static void micro_ops_warn(Int actual_size, Int used_size, Int line_size) |
| 46 | { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 47 | VG_(dmsg)("warning: Pentium 4 with %d KB micro-op instruction trace cache\n", |
| 48 | actual_size); |
| 49 | VG_(dmsg)(" Simulating a %d KB I-cache with %d B lines\n", |
| 50 | used_size, line_size); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 51 | } |
| 52 | |
| 53 | /* Intel method is truly wretched. We have to do an insane indexing into an |
| 54 | * array of pre-defined configurations for various parts of the memory |
weidendo | 1c3e3c5 | 2006-11-23 13:04:30 +0000 | [diff] [blame] | 55 | * hierarchy. |
| 56 | * According to Intel Processor Identification, App Note 485. |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 57 | * |
| 58 | * If a L3 cache is found, then data for it rather than the L2 |
| 59 | * is returned via *LLc. |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 60 | */ |
| 61 | static |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 62 | Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 63 | { |
weidendo | 1c3e3c5 | 2006-11-23 13:04:30 +0000 | [diff] [blame] | 64 | Int cpuid1_eax; |
| 65 | Int cpuid1_ignore; |
| 66 | Int family; |
| 67 | Int model; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 68 | UChar info[16]; |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 69 | Int i, j, trials; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 70 | Bool L2_found = False; |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 71 | /* If we see L3 cache info, copy it into L3c. Then, at the end, |
| 72 | copy it into *LLc. Hence if a L3 cache is specified, *LLc will |
| 73 | eventually contain a description of it rather than the L2 cache. |
| 74 | The use of the L3c intermediary makes this process independent |
| 75 | of the order in which the cache specifications appear in |
| 76 | info[]. */ |
| 77 | Bool L3_found = False; |
| 78 | cache_t L3c = { 0, 0, 0 }; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 79 | |
| 80 | if (level < 2) { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 81 | VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 82 | return -1; |
| 83 | } |
| 84 | |
weidendo | 1c3e3c5 | 2006-11-23 13:04:30 +0000 | [diff] [blame] | 85 | /* family/model needed to distinguish code reuse (currently 0x49) */ |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 86 | VG_(cpuid)(1, 0, &cpuid1_eax, &cpuid1_ignore, |
weidendo | 1c3e3c5 | 2006-11-23 13:04:30 +0000 | [diff] [blame] | 87 | &cpuid1_ignore, &cpuid1_ignore); |
| 88 | family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf); |
| 89 | model = (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf); |
| 90 | |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 91 | VG_(cpuid)(2, 0, (Int*)&info[0], (Int*)&info[4], |
| 92 | (Int*)&info[8], (Int*)&info[12]); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 93 | trials = info[0] - 1; /* AL register - bits 0..7 of %eax */ |
| 94 | info[0] = 0x0; /* reset AL */ |
| 95 | |
| 96 | if (0 != trials) { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 97 | VG_(dmsg)("warning: non-zero CPUID trials for Intel processor (%d)\n", |
| 98 | trials); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 99 | return -1; |
| 100 | } |
| 101 | |
| 102 | for (i = 0; i < 16; i++) { |
| 103 | |
| 104 | switch (info[i]) { |
| 105 | |
| 106 | case 0x0: /* ignore zeros */ |
| 107 | break; |
| 108 | |
| 109 | /* TLB info, ignore */ |
weidendo | 966b5bd | 2006-10-12 14:23:38 +0000 | [diff] [blame] | 110 | case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: |
weidendo | ca7cf38 | 2011-06-10 20:29:27 +0000 | [diff] [blame] | 111 | case 0x0b: |
tom | 55b3a81 | 2009-10-28 09:21:53 +0000 | [diff] [blame] | 112 | case 0x4f: case 0x50: case 0x51: case 0x52: case 0x55: |
tom | 1e76ff5 | 2009-01-02 11:07:18 +0000 | [diff] [blame] | 113 | case 0x56: case 0x57: case 0x59: |
tom | 55b3a81 | 2009-10-28 09:21:53 +0000 | [diff] [blame] | 114 | case 0x5a: case 0x5b: case 0x5c: case 0x5d: |
weidendo | ca7cf38 | 2011-06-10 20:29:27 +0000 | [diff] [blame] | 115 | case 0x76: |
tom | 55b3a81 | 2009-10-28 09:21:53 +0000 | [diff] [blame] | 116 | case 0xb0: case 0xb1: case 0xb2: |
tom | 1e76ff5 | 2009-01-02 11:07:18 +0000 | [diff] [blame] | 117 | case 0xb3: case 0xb4: case 0xba: case 0xc0: |
tom | 55b3a81 | 2009-10-28 09:21:53 +0000 | [diff] [blame] | 118 | case 0xca: |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 119 | break; |
| 120 | |
| 121 | case 0x06: *I1c = (cache_t) { 8, 4, 32 }; break; |
| 122 | case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break; |
tom | 55b3a81 | 2009-10-28 09:21:53 +0000 | [diff] [blame] | 123 | case 0x09: *I1c = (cache_t) { 32, 4, 64 }; break; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 124 | case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break; |
| 125 | |
| 126 | case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break; |
| 127 | case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break; |
weidendo | ca7cf38 | 2011-06-10 20:29:27 +0000 | [diff] [blame] | 128 | case 0x0d: *D1c = (cache_t) { 16, 4, 64 }; break; |
weidendo | 144b76c | 2009-01-26 22:56:14 +0000 | [diff] [blame] | 129 | case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 130 | case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break; |
| 131 | |
| 132 | /* IA-64 info -- panic! */ |
| 133 | case 0x10: case 0x15: case 0x1a: |
| 134 | case 0x88: case 0x89: case 0x8a: case 0x8d: |
| 135 | case 0x90: case 0x96: case 0x9b: |
njn | 6799325 | 2004-11-22 18:02:32 +0000 | [diff] [blame] | 136 | VG_(tool_panic)("IA-64 cache detected?!"); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 137 | |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 138 | /* L3 cache info. */ |
| 139 | case 0x22: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break; |
| 140 | case 0x23: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break; |
| 141 | case 0x25: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break; |
| 142 | case 0x29: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break; |
| 143 | case 0x46: L3c = (cache_t) { 4096, 4, 64 }; L3_found = True; break; |
| 144 | case 0x47: L3c = (cache_t) { 8192, 8, 64 }; L3_found = True; break; |
| 145 | case 0x4a: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break; |
| 146 | case 0x4b: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break; |
| 147 | case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break; |
| 148 | case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break; |
| 149 | case 0xd0: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break; |
| 150 | case 0xd1: L3c = (cache_t) { 1024, 4, 64 }; L3_found = True; break; |
| 151 | case 0xd2: L3c = (cache_t) { 2048, 4, 64 }; L3_found = True; break; |
| 152 | case 0xd6: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break; |
| 153 | case 0xd7: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break; |
| 154 | case 0xd8: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break; |
| 155 | case 0xdc: L3c = (cache_t) { 1536, 12, 64 }; L3_found = True; break; |
| 156 | case 0xdd: L3c = (cache_t) { 3072, 12, 64 }; L3_found = True; break; |
| 157 | case 0xde: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break; |
| 158 | case 0xe2: L3c = (cache_t) { 2048, 16, 64 }; L3_found = True; break; |
| 159 | case 0xe3: L3c = (cache_t) { 4096, 16, 64 }; L3_found = True; break; |
| 160 | case 0xe4: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break; |
| 161 | case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break; |
| 162 | case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break; |
| 163 | case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 164 | |
tom | 55b3a81 | 2009-10-28 09:21:53 +0000 | [diff] [blame] | 165 | /* Described as "MLC" in Intel documentation */ |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 166 | case 0x21: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break; |
tom | 55b3a81 | 2009-10-28 09:21:53 +0000 | [diff] [blame] | 167 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 168 | /* These are sectored, whatever that means */ |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 169 | case 0x39: *LLc = (cache_t) { 128, 4, 64 }; L2_found = True; break; |
| 170 | case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; L2_found = True; break; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 171 | |
| 172 | /* If a P6 core, this means "no L2 cache". |
| 173 | If a P4 core, this means "no L3 cache". |
| 174 | We don't know what core it is, so don't issue a warning. To detect |
| 175 | a missing L2 cache, we use 'L2_found'. */ |
| 176 | case 0x40: |
| 177 | break; |
| 178 | |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 179 | case 0x41: *LLc = (cache_t) { 128, 4, 32 }; L2_found = True; break; |
| 180 | case 0x42: *LLc = (cache_t) { 256, 4, 32 }; L2_found = True; break; |
| 181 | case 0x43: *LLc = (cache_t) { 512, 4, 32 }; L2_found = True; break; |
| 182 | case 0x44: *LLc = (cache_t) { 1024, 4, 32 }; L2_found = True; break; |
| 183 | case 0x45: *LLc = (cache_t) { 2048, 4, 32 }; L2_found = True; break; |
| 184 | case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break; |
| 185 | case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break; |
weidendo | 1c3e3c5 | 2006-11-23 13:04:30 +0000 | [diff] [blame] | 186 | case 0x49: |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 187 | if (family == 15 && model == 6) { |
| 188 | /* On Xeon MP (family F, model 6), this is for L3 */ |
| 189 | L3c = (cache_t) { 4096, 16, 64 }; L3_found = True; |
| 190 | } else { |
| 191 | *LLc = (cache_t) { 4096, 16, 64 }; L2_found = True; |
| 192 | } |
| 193 | break; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 194 | |
| 195 | /* These are sectored, whatever that means */ |
nethercote | ac7ecd7 | 2004-10-13 11:30:14 +0000 | [diff] [blame] | 196 | case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */ |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 197 | case 0x66: *D1c = (cache_t) { 8, 4, 64 }; break; /* sectored */ |
| 198 | case 0x67: *D1c = (cache_t) { 16, 4, 64 }; break; /* sectored */ |
| 199 | case 0x68: *D1c = (cache_t) { 32, 4, 64 }; break; /* sectored */ |
| 200 | |
| 201 | /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based. |
| 202 | * conversion to byte size is a total guess; treat the 12K and 16K |
| 203 | * cases the same since the cache byte size must be a power of two for |
| 204 | * everything to work!. Also guessing 32 bytes for the line size... |
| 205 | */ |
| 206 | case 0x70: /* 12K micro-ops, 8-way */ |
| 207 | *I1c = (cache_t) { 16, 8, 32 }; |
| 208 | micro_ops_warn(12, 16, 32); |
| 209 | break; |
| 210 | case 0x71: /* 16K micro-ops, 8-way */ |
| 211 | *I1c = (cache_t) { 16, 8, 32 }; |
| 212 | micro_ops_warn(16, 16, 32); |
| 213 | break; |
| 214 | case 0x72: /* 32K micro-ops, 8-way */ |
| 215 | *I1c = (cache_t) { 32, 8, 32 }; |
| 216 | micro_ops_warn(32, 32, 32); |
| 217 | break; |
| 218 | |
sewardj | f91b0a3 | 2009-08-28 22:34:09 +0000 | [diff] [blame] | 219 | /* not sectored, whatever that might mean */ |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 220 | case 0x78: *LLc = (cache_t) { 1024, 4, 64 }; L2_found = True; break; |
sewardj | f91b0a3 | 2009-08-28 22:34:09 +0000 | [diff] [blame] | 221 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 222 | /* These are sectored, whatever that means */ |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 223 | case 0x79: *LLc = (cache_t) { 128, 8, 64 }; L2_found = True; break; |
| 224 | case 0x7a: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break; |
| 225 | case 0x7b: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break; |
| 226 | case 0x7c: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break; |
| 227 | case 0x7d: *LLc = (cache_t) { 2048, 8, 64 }; L2_found = True; break; |
| 228 | case 0x7e: *LLc = (cache_t) { 256, 8, 128 }; L2_found = True; break; |
| 229 | case 0x7f: *LLc = (cache_t) { 512, 2, 64 }; L2_found = True; break; |
| 230 | case 0x80: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break; |
| 231 | case 0x81: *LLc = (cache_t) { 128, 8, 32 }; L2_found = True; break; |
| 232 | case 0x82: *LLc = (cache_t) { 256, 8, 32 }; L2_found = True; break; |
| 233 | case 0x83: *LLc = (cache_t) { 512, 8, 32 }; L2_found = True; break; |
| 234 | case 0x84: *LLc = (cache_t) { 1024, 8, 32 }; L2_found = True; break; |
| 235 | case 0x85: *LLc = (cache_t) { 2048, 8, 32 }; L2_found = True; break; |
| 236 | case 0x86: *LLc = (cache_t) { 512, 4, 64 }; L2_found = True; break; |
| 237 | case 0x87: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 238 | |
tom | 942d9ef | 2005-07-27 22:59:50 +0000 | [diff] [blame] | 239 | /* Ignore prefetch information */ |
| 240 | case 0xf0: case 0xf1: |
njn | 6f74a7e | 2009-03-12 00:06:45 +0000 | [diff] [blame] | 241 | break; |
tom | 942d9ef | 2005-07-27 22:59:50 +0000 | [diff] [blame] | 242 | |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 243 | case 0xff: |
| 244 | j = 0; |
| 245 | VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], |
| 246 | (Int*)&info[8], (Int*)&info[12]); |
| 247 | |
| 248 | while ((info[0] & 0x1f) != 0) { |
| 249 | UInt assoc = ((*(UInt *)&info[4] >> 22) & 0x3ff) + 1; |
| 250 | UInt parts = ((*(UInt *)&info[4] >> 12) & 0x3ff) + 1; |
| 251 | UInt line_size = (*(UInt *)&info[4] & 0x7ff) + 1; |
| 252 | UInt sets = *(UInt *)&info[8] + 1; |
| 253 | cache_t c; |
| 254 | |
| 255 | c.size = assoc * parts * line_size * sets / 1024; |
| 256 | c.assoc = assoc; |
| 257 | c.line_size = line_size; |
| 258 | |
| 259 | switch ((info[0] & 0xe0) >> 5) |
| 260 | { |
| 261 | case 1: |
| 262 | switch (info[0] & 0x1f) |
| 263 | { |
| 264 | case 1: *D1c = c; break; |
| 265 | case 2: *I1c = c; break; |
| 266 | case 3: VG_(dmsg)("warning: L1 unified cache ignored\n"); break; |
| 267 | default: VG_(dmsg)("warning: L1 cache of unknown type ignored\n"); break; |
| 268 | } |
| 269 | break; |
| 270 | case 2: |
| 271 | switch (info[0] & 0x1f) |
| 272 | { |
| 273 | case 1: VG_(dmsg)("warning: L2 data cache ignored\n"); break; |
| 274 | case 2: VG_(dmsg)("warning: L2 instruction cache ignored\n"); break; |
| 275 | case 3: *LLc = c; L2_found = True; break; |
| 276 | default: VG_(dmsg)("warning: L2 cache of unknown type ignored\n"); break; |
| 277 | } |
| 278 | break; |
| 279 | case 3: |
| 280 | switch (info[0] & 0x1f) |
| 281 | { |
| 282 | case 1: VG_(dmsg)("warning: L3 data cache ignored\n"); break; |
| 283 | case 2: VG_(dmsg)("warning: L3 instruction cache ignored\n"); break; |
| 284 | case 3: L3c = c; L3_found = True; break; |
| 285 | default: VG_(dmsg)("warning: L3 cache of unknown type ignored\n"); break; |
| 286 | } |
| 287 | break; |
| 288 | default: |
| 289 | VG_(dmsg)("warning: L%u cache ignored\n", (info[0] & 0xe0) >> 5); |
| 290 | break; |
| 291 | } |
| 292 | |
| 293 | VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], |
| 294 | (Int*)&info[8], (Int*)&info[12]); |
| 295 | } |
| 296 | break; |
| 297 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 298 | default: |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 299 | VG_(dmsg)("warning: Unknown Intel cache config value (0x%x), ignoring\n", |
| 300 | info[i]); |
njn | 6f74a7e | 2009-03-12 00:06:45 +0000 | [diff] [blame] | 301 | break; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 302 | } |
| 303 | } |
| 304 | |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 305 | /* If we found a L3 cache, throw away the L2 data and use the L3's instead. */ |
| 306 | if (L3_found) { |
| 307 | VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n"); |
| 308 | *LLc = L3c; |
| 309 | L2_found = True; |
| 310 | } |
| 311 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 312 | if (!L2_found) |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 313 | VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n"); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 314 | |
| 315 | return 0; |
| 316 | } |
| 317 | |
| 318 | /* AMD method is straightforward, just extract appropriate bits from the |
| 319 | * result registers. |
| 320 | * |
| 321 | * Bits, for D1 and I1: |
| 322 | * 31..24 data L1 cache size in KBs |
| 323 | * 23..16 data L1 cache associativity (FFh=full) |
| 324 | * 15.. 8 data L1 cache lines per tag |
| 325 | * 7.. 0 data L1 cache line size in bytes |
| 326 | * |
| 327 | * Bits, for L2: |
| 328 | * 31..16 unified L2 cache size in KBs |
| 329 | * 15..12 unified L2 cache associativity (0=off, FFh=full) |
| 330 | * 11.. 8 unified L2 cache lines per tag |
| 331 | * 7.. 0 unified L2 cache line size in bytes |
| 332 | * |
| 333 | * #3 The AMD K7 processor's L2 cache must be configured prior to relying |
| 334 | * upon this information. (Whatever that means -- njn) |
| 335 | * |
| 336 | * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model |
| 337 | * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB), |
| 338 | * so we detect that. |
| 339 | * |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 340 | * Returns 0 on success, non-zero on failure. As with the Intel code |
| 341 | * above, if a L3 cache is found, then data for it rather than the L2 |
| 342 | * is returned via *LLc. |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 343 | */ |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 344 | |
| 345 | /* A small helper */ |
| 346 | static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 ) |
| 347 | { |
| 348 | /* Decode a L2/L3 associativity indication. It is encoded |
| 349 | differently from the I1/D1 associativity. Returns 1 |
| 350 | (direct-map) as a safe but suboptimal result for unknown |
| 351 | encodings. */ |
| 352 | switch (bits_15_12 & 0xF) { |
| 353 | case 1: return 1; case 2: return 2; |
| 354 | case 4: return 4; case 6: return 8; |
| 355 | case 8: return 16; case 0xA: return 32; |
| 356 | case 0xB: return 48; case 0xC: return 64; |
| 357 | case 0xD: return 96; case 0xE: return 128; |
| 358 | case 0xF: /* fully associative */ |
| 359 | case 0: /* L2/L3 cache or TLB is disabled */ |
| 360 | default: |
| 361 | return 1; |
| 362 | } |
| 363 | } |
| 364 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 365 | static |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 366 | Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc) |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 367 | { |
| 368 | UInt ext_level; |
| 369 | UInt dummy, model; |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 370 | UInt I1i, D1i, L2i, L3i; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 371 | |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 372 | VG_(cpuid)(0x80000000, 0, &ext_level, &dummy, &dummy, &dummy); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 373 | |
| 374 | if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 375 | VG_(dmsg)("warning: ext_level < 0x80000006 for AMD processor (0x%x)\n", |
| 376 | ext_level); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 377 | return -1; |
| 378 | } |
| 379 | |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 380 | VG_(cpuid)(0x80000005, 0, &dummy, &dummy, &D1i, &I1i); |
| 381 | VG_(cpuid)(0x80000006, 0, &dummy, &dummy, &L2i, &L3i); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 382 | |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 383 | VG_(cpuid)(0x1, 0, &model, &dummy, &dummy, &dummy); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 384 | |
| 385 | /* Check for Duron bug */ |
| 386 | if (model == 0x630) { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 387 | VG_(dmsg)("warning: Buggy Duron stepping A0. Assuming L2 size=65536 bytes\n"); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 388 | L2i = (64 << 16) | (L2i & 0xffff); |
| 389 | } |
| 390 | |
| 391 | D1c->size = (D1i >> 24) & 0xff; |
| 392 | D1c->assoc = (D1i >> 16) & 0xff; |
| 393 | D1c->line_size = (D1i >> 0) & 0xff; |
| 394 | |
| 395 | I1c->size = (I1i >> 24) & 0xff; |
| 396 | I1c->assoc = (I1i >> 16) & 0xff; |
| 397 | I1c->line_size = (I1i >> 0) & 0xff; |
| 398 | |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 399 | LLc->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */ |
| 400 | LLc->assoc = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf); |
| 401 | LLc->line_size = (L2i >> 0) & 0xff; |
| 402 | |
| 403 | if (((L3i >> 18) & 0x3fff) > 0) { |
| 404 | /* There's an L3 cache. Replace *LLc contents with this info. */ |
| 405 | /* NB: the test in the if is "if L3 size > 0 ". I don't know if |
| 406 | this is the right way to test presence-vs-absence of L3. I |
| 407 | can't see any guidance on this in the AMD documentation. */ |
| 408 | LLc->size = ((L3i >> 18) & 0x3fff) * 512; |
| 409 | LLc->assoc = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf); |
| 410 | LLc->line_size = (L3i >> 0) & 0xff; |
| 411 | VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n"); |
| 412 | } |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 413 | |
| 414 | return 0; |
| 415 | } |
| 416 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 417 | static |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 418 | Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc) |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 419 | { |
sewardj | b5f6f51 | 2005-03-10 23:59:00 +0000 | [diff] [blame] | 420 | Int level, ret; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 421 | Char vendor_id[13]; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 422 | |
sewardj | b5f6f51 | 2005-03-10 23:59:00 +0000 | [diff] [blame] | 423 | if (!VG_(has_cpuid)()) { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 424 | VG_(dmsg)("CPUID instruction not supported\n"); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 425 | return -1; |
| 426 | } |
tom | f4ed059 | 2005-04-02 17:30:19 +0000 | [diff] [blame] | 427 | |
tom | ad8a591 | 2011-06-10 15:04:22 +0000 | [diff] [blame] | 428 | VG_(cpuid)(0, 0, &level, (int*)&vendor_id[0], |
sewardj | b5f6f51 | 2005-03-10 23:59:00 +0000 | [diff] [blame] | 429 | (int*)&vendor_id[8], (int*)&vendor_id[4]); |
| 430 | vendor_id[12] = '\0'; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 431 | |
| 432 | if (0 == level) { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 433 | VG_(dmsg)("CPUID level is 0, early Pentium?\n"); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 434 | return -1; |
| 435 | } |
| 436 | |
| 437 | /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */ |
| 438 | if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) { |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 439 | ret = Intel_cache_info(level, I1c, D1c, LLc); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 440 | |
| 441 | } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) { |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 442 | ret = AMD_cache_info(I1c, D1c, LLc); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 443 | |
| 444 | } else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) { |
| 445 | /* Total kludge. Pretend to be a VIA Nehemiah. */ |
| 446 | D1c->size = 64; |
| 447 | D1c->assoc = 16; |
| 448 | D1c->line_size = 16; |
| 449 | I1c->size = 64; |
| 450 | I1c->assoc = 4; |
| 451 | I1c->line_size = 16; |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 452 | LLc->size = 64; |
| 453 | LLc->assoc = 16; |
| 454 | LLc->line_size = 16; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 455 | ret = 0; |
| 456 | |
| 457 | } else { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 458 | VG_(dmsg)("CPU vendor ID not recognised (%s)\n", vendor_id); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 459 | return -1; |
| 460 | } |
| 461 | |
| 462 | /* Successful! Convert sizes from KB to bytes */ |
| 463 | I1c->size *= 1024; |
| 464 | D1c->size *= 1024; |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 465 | LLc->size *= 1024; |
weidendo | ca7cf38 | 2011-06-10 20:29:27 +0000 | [diff] [blame] | 466 | |
sewardj | aebbf1c | 2011-06-13 13:14:00 +0000 | [diff] [blame] | 467 | /* If the LL cache config isn't something the simulation functions |
| 468 | can handle, try to adjust it so it is. Caches are characterised |
| 469 | by (total size T, line size L, associativity A), and then we |
| 470 | have |
| 471 | |
| 472 | number of sets S = T / (L * A) |
| 473 | |
| 474 | The required constraints are: |
| 475 | |
| 476 | * L must be a power of 2, but it always is in practice, so |
| 477 | no problem there |
| 478 | |
| 479 | * A can be any value >= 1 |
| 480 | |
| 481 | * T can be any value, but .. |
| 482 | |
| 483 | * S must be a power of 2. |
| 484 | |
| 485 | That sometimes gives a problem. For example, some Core iX based |
| 486 | Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288 |
| 487 | sets. The "fix" in this case is to increase the associativity |
| 488 | by 50% to 24, which reduces the number of sets to 8192, making |
| 489 | it a power of 2. That's what the following code does (handing |
| 490 | the "3/2 rescaling case".) We might need to deal with other |
| 491 | ratios later (5/4 ?). |
| 492 | |
| 493 | The "fix" is "justified" (cough, cough) by alleging that |
| 494 | increases of associativity above about 4 have very little effect |
| 495 | on the actual miss rate. It would be far more inaccurate to |
| 496 | fudge this by changing the size of the simulated cache -- |
| 497 | changing the associativity is a much better option. |
| 498 | */ |
| 499 | if (LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0) { |
| 500 | Long nSets = (Long)LLc->size / (Long)(LLc->line_size * LLc->assoc); |
| 501 | if (/* stay sane */ |
| 502 | nSets >= 4 |
| 503 | /* nSets is not a power of 2 */ |
| 504 | && VG_(log2_64)( (ULong)nSets ) == -1 |
| 505 | /* nSets is 50% above a power of 2 */ |
| 506 | && VG_(log2_64)( (ULong)((2 * nSets) / (Long)3) ) != -1 |
| 507 | /* associativity can be increased by exactly 50% */ |
| 508 | && (LLc->assoc % 2) == 0 |
| 509 | ) { |
| 510 | /* # sets is 1.5 * a power of two, but the associativity is |
| 511 | even, so we can increase that up by 50% and implicitly |
| 512 | scale the # sets down accordingly. */ |
| 513 | Int new_assoc = LLc->assoc + (LLc->assoc / 2); |
| 514 | VG_(dmsg)("warning: pretending that LL cache has associativity" |
| 515 | " %d instead of actual %d\n", new_assoc, LLc->assoc); |
| 516 | LLc->assoc = new_assoc; |
| 517 | } |
| 518 | } |
| 519 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 520 | return ret; |
| 521 | } |
| 522 | |
| 523 | |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 524 | void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, |
njn | af839f5 | 2005-06-23 03:27:57 +0000 | [diff] [blame] | 525 | Bool all_caches_clo_defined) |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 526 | { |
| 527 | Int res; |
| 528 | |
| 529 | // Set caches to default. |
njn | a1d1a64 | 2004-11-26 18:36:02 +0000 | [diff] [blame] | 530 | *I1c = (cache_t) { 65536, 2, 64 }; |
| 531 | *D1c = (cache_t) { 65536, 2, 64 }; |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 532 | *LLc = (cache_t) { 262144, 8, 64 }; |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 533 | |
| 534 | // Then replace with any info we can get from CPUID. |
njn | 2d853a1 | 2010-10-06 22:46:31 +0000 | [diff] [blame] | 535 | res = get_caches_from_CPUID(I1c, D1c, LLc); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 536 | |
| 537 | // Warn if CPUID failed and config not completely specified from cmd line. |
| 538 | if (res != 0 && !all_caches_clo_defined) { |
sewardj | b2c985b | 2009-07-15 14:51:17 +0000 | [diff] [blame] | 539 | VG_(dmsg)("Warning: Couldn't auto-detect cache config, using one " |
| 540 | "or more defaults \n"); |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 541 | } |
| 542 | } |
| 543 | |
njn | 8b68b64 | 2009-06-24 00:37:09 +0000 | [diff] [blame] | 544 | #endif // defined(VGA_x86) || defined(VGA_amd64) |
| 545 | |
nethercote | b35a8b9 | 2004-09-11 16:45:27 +0000 | [diff] [blame] | 546 | /*--------------------------------------------------------------------*/ |
| 547 | /*--- end ---*/ |
| 548 | /*--------------------------------------------------------------------*/ |