blob: 9edbecc17257a8625a7d965ef7faaefa4d863917 [file] [log] [blame]
weidendoa17f2a32006-03-20 10:27:30 +00001
2/*--------------------------------------------------------------------*/
3/*--- Cache simulation. ---*/
4/*--- sim.c ---*/
5/*--------------------------------------------------------------------*/
6
7/*
njn9a0cba42007-04-15 22:15:57 +00008 This file is part of Callgrind, a Valgrind tool for call graph
9 profiling programs.
weidendoa17f2a32006-03-20 10:27:30 +000010
njn9a0cba42007-04-15 22:15:57 +000011 Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
weidendoa17f2a32006-03-20 10:27:30 +000012
njn9a0cba42007-04-15 22:15:57 +000013 This tool is derived from and contains code from Cachegrind
njn9f207462009-03-10 22:02:09 +000014 Copyright (C) 2002-2009 Nicholas Nethercote (njn@valgrind.org)
weidendoa17f2a32006-03-20 10:27:30 +000015
16 This program is free software; you can redistribute it and/or
17 modify it under the terms of the GNU General Public License as
18 published by the Free Software Foundation; either version 2 of the
19 License, or (at your option) any later version.
20
21 This program is distributed in the hope that it will be useful, but
22 WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with this program; if not, write to the Free Software
28 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 02111-1307, USA.
30
31 The GNU General Public License is contained in the file COPYING.
32*/
33
34#include "global.h"
35
36
37/* Notes:
38 - simulates a write-allocate cache
39 - (block --> set) hash function uses simple bit selection
40 - handling of references straddling two cache blocks:
41 - counts as only one cache access (not two)
42 - both blocks hit --> one hit
43 - one block hits, the other misses --> one miss
44 - both blocks miss --> one miss (not two)
45*/
46
47/* Cache configuration */
48#include "cg_arch.h"
49
50/* additional structures for cache use info, separated
51 * according usage frequency:
52 * - line_loaded : pointer to cost center of instruction
53 * which loaded the line into cache.
54 * Needed to increment counters when line is evicted.
55 * - line_use : updated on every access
56 */
57typedef struct {
58 UInt count;
59 UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
60} line_use;
61
62typedef struct {
63 Addr memline, iaddr;
64 line_use* dep_use; /* point to higher-level cacheblock for this memline */
65 ULong* use_base;
66} line_loaded;
67
68/* Cache state */
69typedef struct {
70 char* name;
71 int size; /* bytes */
72 int assoc;
73 int line_size; /* bytes */
74 Bool sectored; /* prefetch nearside cacheline on read */
75 int sets;
76 int sets_min_1;
weidendoa17f2a32006-03-20 10:27:30 +000077 int line_size_bits;
78 int tag_shift;
79 UWord tag_mask;
80 char desc_line[128];
81 UWord* tags;
82
83 /* for cache use */
84 int line_size_mask;
85 int* line_start_mask;
86 int* line_end_mask;
87 line_loaded* loaded;
88 line_use* use;
89} cache_t2;
90
91/*
92 * States of flat caches in our model.
93 * We use a 2-level hierarchy,
94 */
95static cache_t2 I1, D1, L2;
96
97/* Lower bits of cache tags are used as flags for a cache line */
98#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
99#define CACHELINE_DIRTY 1
100
101
102/* Cache simulator Options */
103static Bool clo_simulate_writeback = False;
104static Bool clo_simulate_hwpref = False;
105static Bool clo_simulate_sectors = False;
106static Bool clo_collect_cacheuse = False;
107
108/* Following global vars are setup before by
109 * setup_bbcc()/cachesim_after_bbsetup():
110 *
111 * - Addr bb_base (instruction start address of original BB)
112 * - ULong* cost_base (start of cost array for BB)
113 * - BBCC* nonskipped (only != 0 when in a function not skipped)
114 */
115
116/* Offset to events in event set, used in log_* functions */
117static Int off_D0_Ir;
118static Int off_D1r_Ir;
119static Int off_D1r_Dr;
120static Int off_D1w_Ir;
121static Int off_D1w_Dw;
122static Int off_D2_Ir;
123static Int off_D2_Dr;
124static Int off_D2_Dw;
125
126static Addr bb_base;
127static ULong* cost_base;
128static InstrInfo* current_ii;
129
130/* Cache use offsets */
131/* FIXME: The offsets are only correct because all eventsets get
132 * the "Use" set added first !
133 */
134static Int off_I1_AcCost = 0;
135static Int off_I1_SpLoss = 1;
136static Int off_D1_AcCost = 0;
137static Int off_D1_SpLoss = 1;
138static Int off_L2_AcCost = 2;
139static Int off_L2_SpLoss = 3;
140
141/* Cache access types */
142typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
143
144/* Result of a reference into a flat cache */
145typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
146
147/* Result of a reference into a hierarchical cache model */
148typedef enum {
149 L1_Hit,
150 L2_Hit,
151 MemAccess,
152 WriteBackMemAccess } CacheModelResult;
153
154typedef CacheModelResult (*simcall_type)(Addr, UChar);
155
156static struct {
157 simcall_type I1_Read;
158 simcall_type D1_Read;
159 simcall_type D1_Write;
160} simulator;
161
162/*------------------------------------------------------------*/
163/*--- Cache Simulator Initialization ---*/
164/*------------------------------------------------------------*/
165
166static void cachesim_clearcache(cache_t2* c)
167{
168 Int i;
169
170 for (i = 0; i < c->sets * c->assoc; i++)
171 c->tags[i] = 0;
172 if (c->use) {
173 for (i = 0; i < c->sets * c->assoc; i++) {
174 c->loaded[i].memline = 0;
175 c->loaded[i].use_base = 0;
176 c->loaded[i].dep_use = 0;
177 c->loaded[i].iaddr = 0;
178 c->use[i].mask = 0;
179 c->use[i].count = 0;
180 c->tags[i] = i % c->assoc; /* init lower bits as pointer */
181 }
182 }
183}
184
185static void cacheuse_initcache(cache_t2* c);
186
187/* By this point, the size/assoc/line_size has been checked. */
188static void cachesim_initcache(cache_t config, cache_t2* c)
189{
190 c->size = config.size;
191 c->assoc = config.assoc;
192 c->line_size = config.line_size;
193 c->sectored = False; // FIXME
194
195 c->sets = (c->size / c->line_size) / c->assoc;
196 c->sets_min_1 = c->sets - 1;
weidendoa17f2a32006-03-20 10:27:30 +0000197 c->line_size_bits = VG_(log2)(c->line_size);
198 c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
199 c->tag_mask = ~((1<<c->tag_shift)-1);
200
201 /* Can bits in tag entries be used for flags?
202 * Should be always true as MIN_LINE_SIZE >= 16 */
203 CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
204
205 if (c->assoc == 1) {
206 VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
207 c->size, c->line_size,
208 c->sectored ? ", sectored":"");
209 } else {
210 VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
211 c->size, c->line_size, c->assoc,
212 c->sectored ? ", sectored":"");
213 }
214
sewardj9c606bd2008-09-18 18:12:50 +0000215 c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
216 sizeof(UWord) * c->sets * c->assoc);
weidendoa17f2a32006-03-20 10:27:30 +0000217 if (clo_collect_cacheuse)
218 cacheuse_initcache(c);
219 else
220 c->use = 0;
221 cachesim_clearcache(c);
222}
223
224
#if 0
/* Debug helper: dump all tags of a cache, one line per set. */
static void print_cache(cache_t2* c)
{
    UInt set, way, idx = 0;

    /* 'idx' walks the flat tags array in step with (set, way). */
    for (set = 0; set < c->sets; set++) {
	for (way = 0; way < c->assoc; way++, idx++) {
	    VG_(printf)("%8x ", c->tags[idx]);
	}
	VG_(printf)("\n");
    }
}
#endif
239
240
241/*------------------------------------------------------------*/
242/*--- Write Through Cache Simulation ---*/
243/*------------------------------------------------------------*/
244
245/*
246 * Simple model: L1 & L2 Write Through
247 * Does not distinguish among read and write references
248 *
249 * Simulator functions:
250 * CacheModelResult cachesim_I1_ref(Addr a, UChar size)
251 * CacheModelResult cachesim_D1_ref(Addr a, UChar size)
252 */
253
254static __inline__
255CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
256{
257 int i, j;
258 UWord *set;
259
weidendo144b76c2009-01-26 22:56:14 +0000260 set = &(c->tags[set_no * c->assoc]);
weidendoa17f2a32006-03-20 10:27:30 +0000261
262 /* This loop is unrolled for just the first case, which is the most */
263 /* common. We can't unroll any further because it would screw up */
264 /* if we have a direct-mapped (1-way) cache. */
265 if (tag == set[0])
266 return Hit;
267
268 /* If the tag is one other than the MRU, move it into the MRU spot */
269 /* and shuffle the rest down. */
270 for (i = 1; i < c->assoc; i++) {
271 if (tag == set[i]) {
272 for (j = i; j > 0; j--) {
273 set[j] = set[j - 1];
274 }
275 set[0] = tag;
276 return Hit;
277 }
278 }
279
280 /* A miss; install this tag as MRU, shuffle rest down. */
281 for (j = c->assoc - 1; j > 0; j--) {
282 set[j] = set[j - 1];
283 }
284 set[0] = tag;
285
286 return Miss;
287}
288
289static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
290{
291 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
292 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
293 UWord tag = a >> c->tag_shift;
294
295 /* Access entirely within line. */
296 if (set1 == set2)
297 return cachesim_setref(c, set1, tag);
298
299 /* Access straddles two lines. */
300 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
301 else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo28e2a142006-11-22 21:00:53 +0000302 UWord tag2 = (a+size-1) >> c->tag_shift;
weidendoa17f2a32006-03-20 10:27:30 +0000303
304 /* the call updates cache structures as side effect */
305 CacheResult res1 = cachesim_setref(c, set1, tag);
weidendo28e2a142006-11-22 21:00:53 +0000306 CacheResult res2 = cachesim_setref(c, set2, tag2);
weidendoa17f2a32006-03-20 10:27:30 +0000307 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
308
309 } else {
njn8a7b41b2007-09-23 00:51:24 +0000310 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendoa17f2a32006-03-20 10:27:30 +0000311 VG_(tool_panic)("item straddles more than two cache sets");
312 }
313 return Hit;
314}
315
316static
317CacheModelResult cachesim_I1_ref(Addr a, UChar size)
318{
319 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
320 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
321 return MemAccess;
322}
323
324static
325CacheModelResult cachesim_D1_ref(Addr a, UChar size)
326{
327 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
328 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
329 return MemAccess;
330}
331
332
333/*------------------------------------------------------------*/
334/*--- Write Back Cache Simulation ---*/
335/*------------------------------------------------------------*/
336
337/*
338 * More complex model: L1 Write-through, L2 Write-back
339 * This needs to distinguish among read and write references.
340 *
341 * Simulator functions:
342 * CacheModelResult cachesim_I1_Read(Addr a, UChar size)
343 * CacheModelResult cachesim_D1_Read(Addr a, UChar size)
344 * CacheModelResult cachesim_D1_Write(Addr a, UChar size)
345 */
346
347/*
348 * With write-back, result can be a miss evicting a dirty line
349 * The dirty state of a cache line is stored in Bit0 of the tag for
350 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
351 * type (Read/Write), the line gets dirty on a write.
352 */
353static __inline__
354CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
355{
356 int i, j;
357 UWord *set, tmp_tag;
358
weidendo144b76c2009-01-26 22:56:14 +0000359 set = &(c->tags[set_no * c->assoc]);
weidendoa17f2a32006-03-20 10:27:30 +0000360
361 /* This loop is unrolled for just the first case, which is the most */
362 /* common. We can't unroll any further because it would screw up */
363 /* if we have a direct-mapped (1-way) cache. */
364 if (tag == (set[0] & ~CACHELINE_DIRTY)) {
365 set[0] |= ref;
366 return Hit;
367 }
368 /* If the tag is one other than the MRU, move it into the MRU spot */
369 /* and shuffle the rest down. */
370 for (i = 1; i < c->assoc; i++) {
371 if (tag == (set[i] & ~CACHELINE_DIRTY)) {
372 tmp_tag = set[i] | ref; // update dirty flag
373 for (j = i; j > 0; j--) {
374 set[j] = set[j - 1];
375 }
376 set[0] = tmp_tag;
377 return Hit;
378 }
379 }
380
381 /* A miss; install this tag as MRU, shuffle rest down. */
382 tmp_tag = set[c->assoc - 1];
383 for (j = c->assoc - 1; j > 0; j--) {
384 set[j] = set[j - 1];
385 }
386 set[0] = tag | ref;
387
388 return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
389}
390
391
392static __inline__
393CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
394{
395 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
396 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
397 UWord tag = a & c->tag_mask;
398
399 /* Access entirely within line. */
400 if (set1 == set2)
401 return cachesim_setref_wb(c, ref, set1, tag);
402
403 /* Access straddles two lines. */
404 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
405 else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo144b76c2009-01-26 22:56:14 +0000406 UWord tag2 = (a+size-1) & c->tag_mask;
weidendoa17f2a32006-03-20 10:27:30 +0000407
408 /* the call updates cache structures as side effect */
409 CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
weidendo28e2a142006-11-22 21:00:53 +0000410 CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);
weidendoa17f2a32006-03-20 10:27:30 +0000411
412 if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
413 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
414
415 } else {
njn8a7b41b2007-09-23 00:51:24 +0000416 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendoa17f2a32006-03-20 10:27:30 +0000417 VG_(tool_panic)("item straddles more than two cache sets");
418 }
419 return Hit;
420}
421
422
423static
424CacheModelResult cachesim_I1_Read(Addr a, UChar size)
425{
426 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
427 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
428 case Hit: return L2_Hit;
429 case Miss: return MemAccess;
430 default: break;
431 }
432 return WriteBackMemAccess;
433}
434
435static
436CacheModelResult cachesim_D1_Read(Addr a, UChar size)
437{
438 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
439 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
440 case Hit: return L2_Hit;
441 case Miss: return MemAccess;
442 default: break;
443 }
444 return WriteBackMemAccess;
445}
446
447static
448CacheModelResult cachesim_D1_Write(Addr a, UChar size)
449{
450 if ( cachesim_ref( &D1, a, size) == Hit ) {
451 /* Even for a L1 hit, the write-trough L1 passes
452 * the write to the L2 to make the L2 line dirty.
453 * But this causes no latency, so return the hit.
454 */
455 cachesim_ref_wb( &L2, Write, a, size);
456 return L1_Hit;
457 }
458 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
459 case Hit: return L2_Hit;
460 case Miss: return MemAccess;
461 default: break;
462 }
463 return WriteBackMemAccess;
464}
465
466
467/*------------------------------------------------------------*/
468/*--- Hardware Prefetch Simulation ---*/
469/*------------------------------------------------------------*/
470
471static ULong prefetch_up = 0;
472static ULong prefetch_down = 0;
473
474#define PF_STREAMS 8
475#define PF_PAGEBITS 12
476
477static UInt pf_lastblock[PF_STREAMS];
478static Int pf_seqblocks[PF_STREAMS];
479
480static
481void prefetch_clear(void)
482{
483 int i;
484 for(i=0;i<PF_STREAMS;i++)
485 pf_lastblock[i] = pf_seqblocks[i] = 0;
486}
487
488/*
489 * HW Prefetch emulation
490 * Start prefetching when detecting sequential access to 3 memory blocks.
491 * One stream can be detected per 4k page.
492 */
493static __inline__
weidendo09ee78e2009-02-24 12:26:53 +0000494void prefetch_L2_doref(Addr a)
weidendoa17f2a32006-03-20 10:27:30 +0000495{
496 UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
497 UInt block = ( a >> L2.line_size_bits);
498
499 if (block != pf_lastblock[stream]) {
500 if (pf_seqblocks[stream] == 0) {
501 if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
502 else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
503 }
504 else if (pf_seqblocks[stream] >0) {
505 if (pf_lastblock[stream] +1 == block) {
506 pf_seqblocks[stream]++;
507 if (pf_seqblocks[stream] >= 2) {
508 prefetch_up++;
509 cachesim_ref(&L2, a + 5 * L2.line_size,1);
510 }
511 }
512 else pf_seqblocks[stream] = 0;
513 }
514 else if (pf_seqblocks[stream] <0) {
515 if (pf_lastblock[stream] -1 == block) {
516 pf_seqblocks[stream]--;
517 if (pf_seqblocks[stream] <= -2) {
518 prefetch_down++;
519 cachesim_ref(&L2, a - 5 * L2.line_size,1);
520 }
521 }
522 else pf_seqblocks[stream] = 0;
523 }
524 pf_lastblock[stream] = block;
525 }
526}
527
528/* simple model with hardware prefetch */
529
530static
531CacheModelResult prefetch_I1_ref(Addr a, UChar size)
532{
533 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000534 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000535 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
536 return MemAccess;
537}
538
539static
540CacheModelResult prefetch_D1_ref(Addr a, UChar size)
541{
542 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000543 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000544 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
545 return MemAccess;
546}
547
548
549/* complex model with hardware prefetch */
550
551static
552CacheModelResult prefetch_I1_Read(Addr a, UChar size)
553{
554 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000555 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000556 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
557 case Hit: return L2_Hit;
558 case Miss: return MemAccess;
559 default: break;
560 }
561 return WriteBackMemAccess;
562}
563
564static
565CacheModelResult prefetch_D1_Read(Addr a, UChar size)
566{
567 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000568 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000569 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
570 case Hit: return L2_Hit;
571 case Miss: return MemAccess;
572 default: break;
573 }
574 return WriteBackMemAccess;
575}
576
577static
578CacheModelResult prefetch_D1_Write(Addr a, UChar size)
579{
weidendo09ee78e2009-02-24 12:26:53 +0000580 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000581 if ( cachesim_ref( &D1, a, size) == Hit ) {
582 /* Even for a L1 hit, the write-trough L1 passes
583 * the write to the L2 to make the L2 line dirty.
584 * But this causes no latency, so return the hit.
585 */
586 cachesim_ref_wb( &L2, Write, a, size);
587 return L1_Hit;
588 }
589 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
590 case Hit: return L2_Hit;
591 case Miss: return MemAccess;
592 default: break;
593 }
594 return WriteBackMemAccess;
595}
596
597
598/*------------------------------------------------------------*/
599/*--- Cache Simulation with use metric collection ---*/
600/*------------------------------------------------------------*/
601
602/* can not be combined with write-back or prefetch */
603
604static
605void cacheuse_initcache(cache_t2* c)
606{
607 int i;
608 unsigned int start_mask, start_val;
609 unsigned int end_mask, end_val;
610
sewardj9c606bd2008-09-18 18:12:50 +0000611 c->use = CLG_MALLOC("cl.sim.cu_ic.1",
612 sizeof(line_use) * c->sets * c->assoc);
613 c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
614 sizeof(line_loaded) * c->sets * c->assoc);
615 c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
616 sizeof(int) * c->line_size);
617 c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
618 sizeof(int) * c->line_size);
weidendoa17f2a32006-03-20 10:27:30 +0000619
weidendoa17f2a32006-03-20 10:27:30 +0000620 c->line_size_mask = c->line_size-1;
621
622 /* Meaning of line_start_mask/line_end_mask
623 * Example: for a given cache line, you get an access starting at
624 * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
625 * line size of 32, you have 1 bit per byte in the mask:
626 *
627 * bit31 bit8 bit5 bit 0
628 * | | | |
629 * 11..111111100000 line_start_mask[5]
630 * 00..000111111111 line_end_mask[(5+4)-1]
631 *
632 * use_mask |= line_start_mask[5] && line_end_mask[8]
633 *
634 */
635 start_val = end_val = ~0;
636 if (c->line_size < 32) {
637 int bits_per_byte = 32/c->line_size;
638 start_mask = (1<<bits_per_byte)-1;
639 end_mask = start_mask << (32-bits_per_byte);
640 for(i=0;i<c->line_size;i++) {
641 c->line_start_mask[i] = start_val;
642 start_val = start_val & ~start_mask;
643 start_mask = start_mask << bits_per_byte;
644
645 c->line_end_mask[c->line_size-i-1] = end_val;
646 end_val = end_val & ~end_mask;
647 end_mask = end_mask >> bits_per_byte;
648 }
649 }
650 else {
651 int bytes_per_bit = c->line_size/32;
652 start_mask = 1;
653 end_mask = 1 << 31;
654 for(i=0;i<c->line_size;i++) {
655 c->line_start_mask[i] = start_val;
656 c->line_end_mask[c->line_size-i-1] = end_val;
657 if ( ((i+1)%bytes_per_bit) == 0) {
658 start_val &= ~start_mask;
659 end_val &= ~end_mask;
660 start_mask <<= 1;
661 end_mask >>= 1;
662 }
663 }
664 }
665
666 CLG_DEBUG(6, "Config %s:\n", c->desc_line);
667 for(i=0;i<c->line_size;i++) {
668 CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
669 i, c->line_start_mask[i], c->line_end_mask[i]);
670 }
671
672 /* We use lower tag bits as offset pointers to cache use info.
673 * I.e. some cache parameters don't work.
674 */
weidendo144b76c2009-01-26 22:56:14 +0000675 if ( (1<<c->tag_shift) < c->assoc) {
weidendoa17f2a32006-03-20 10:27:30 +0000676 VG_(message)(Vg_DebugMsg,
677 "error: Use associativity < %d for cache use statistics!",
678 (1<<c->tag_shift) );
679 VG_(tool_panic)("Unsupported cache configuration");
680 }
681}
682
weidendoa17f2a32006-03-20 10:27:30 +0000683
684/* for I1/D1 caches */
/* For I1/D1 caches: generates cacheuse_<L>_doRead, an LRU lookup that
 * additionally tracks, per cache line, which bytes were touched
 * (use_mask) and how often the line was accessed since load. The low
 * tag bits hold the line's offset into the use arrays. */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                             \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                \
	     L.name, a, size, set1, set2);                                  \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
	         L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* MRU hit (can't unroll further: 1-way caches). */                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
	 return L1_Hit;                                                     \
      }                                                                     \
      /* Hit on another way: rotate it to MRU. */                           \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
	    for (j = i; j > 0; j--) {                                       \
	       set[j] = set[j - 1];                                         \
	    }                                                               \
	    set[0] = tmp_tag;                                               \
	    idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	    CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
	    return L1_Hit;                                                  \
	 }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss; install this tag as MRU, shuffle rest down. */             \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
	 set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
		       use_mask, a &~ L.line_size_mask);                    \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */           \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
	    for (j = i; j > 0; j--) {                                       \
	       set[j] = set[j - 1];                                         \
	    }                                                               \
	    set[0] = tmp_tag;                                               \
	    idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	    CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            goto block2;                                                    \
	 }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
	 set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
		       use_mask, a &~ L.line_size_mask);                    \
block2:                                                                     \
      /* Second line of the straddling access. */                           \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2 = (a+size-1) & L.tag_mask;                                       \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag2 == (set[i] & L.tag_mask)) {                               \
  	    tmp_tag = set[i];                                               \
	    for (j = i; j > 0; j--) {                                       \
	       set[j] = set[j - 1];                                         \
	    }                                                               \
	    set[0] = tmp_tag;                                               \
	    idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	    CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return miss1;                                                   \
	 }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
	 set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
		       use_mask, (a+size-1) &~ L.line_size_mask);           \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit;     \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
       VG_(tool_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   return 0;                                                                \
}
832
833
834/* logarithmic bitcounting algorithm, see
835 * http://graphics.stanford.edu/~seander/bithacks.html
836 */
/* Population count: number of set bits in 'bits'.
 * Logarithmic bit-counting (SWAR), see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
    unsigned int v = bits;

    v = v - ((v >> 1) & 0x55555555u);               /* pairs        */
    v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); /* nibbles    */
    v = (v + (v >> 4)) & 0x0F0F0F0Fu;               /* bytes        */
    return (v * 0x01010101u) >> 24;                 /* sum of bytes */
}
851
852static void update_L2_use(int idx, Addr memline)
853{
854 line_loaded* loaded = &(L2.loaded[idx]);
855 line_use* use = &(L2.use[idx]);
856 int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
857
barta0b6b2c2008-07-07 06:49:24 +0000858 CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
weidendoa17f2a32006-03-20 10:27:30 +0000859 idx, bb_base + current_ii->instr_offset, memline);
860 if (use->count>0) {
barta0b6b2c2008-07-07 06:49:24 +0000861 CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
weidendoa17f2a32006-03-20 10:27:30 +0000862 use->count, i, use->mask, loaded->memline, loaded->iaddr);
863 CLG_DEBUG(2, " collect: %d, use_base %p\n",
864 CLG_(current_state).collect, loaded->use_base);
865
866 if (CLG_(current_state).collect && loaded->use_base) {
867 (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
868 (loaded->use_base)[off_L2_SpLoss] += i;
869 }
870 }
871
872 use->count = 0;
873 use->mask = 0;
874
875 loaded->memline = memline;
876 loaded->iaddr = bb_base + current_ii->instr_offset;
877 loaded->use_base = (CLG_(current_state).nonskipped) ?
878 CLG_(current_state).nonskipped->skipped :
879 cost_base + current_ii->cost_offset;
880}
881
/* Look up one memory line in the (shared) L2 cache, maintaining LRU order
 * and linking the L1 line's usage record to the L2 line it depends on.
 *
 * memline    line-aligned address being accessed
 * l1_loaded  load record of the L1 line that triggered this access;
 *            its dep_use is pointed at the matching L2 use counter.
 *
 * Returns L2_Hit on a hit, MemAccess on a miss (after evicting the LRU way).
 *
 * NOTE: the low bits of each stored tag (below tag_mask) hold the way's
 * original slot index, so the use/loaded arrays can be addressed even
 * after ways are shuffled for LRU — see the `& ~L2.tag_mask` extraction.
 */
static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UWord* set = &(L2.tags[setNo * L2.assoc]);
   UWord tag  = memline & L2.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);

   /* Fast path: MRU way hits, no reordering needed. */
   if (tag == (set[0] & L2.tag_mask)) {
     idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
     l1_loaded->dep_use = &(L2.use[idx]);

     CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
		 L2.use[idx].mask, L2.use[idx].count);
     return L2_Hit;
   }

   /* Search the remaining ways; on a hit, move that way to MRU position. */
   for (i = 1; i < L2.assoc; i++) {
     if (tag == (set[i] & L2.tag_mask)) {
       tmp_tag = set[i];
       for (j = i; j > 0; j--) {
         set[j] = set[j - 1];
       }
       set[0] = tmp_tag;
       idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
       l1_loaded->dep_use = &(L2.use[idx]);

	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 i, idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
		 L2.use[idx].mask, L2.use[idx].count);
	return L2_Hit;
     }
   }

   /* A miss;  install this tag as MRU, shuffle rest down. */
   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;  /* slot index of evicted LRU way */
   for (j = L2.assoc - 1; j > 0; j--) {
     set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * L2.assoc) + tmp_tag;
   l1_loaded->dep_use = &(L2.use[idx]);

   /* Flush usage info of the evicted line and record the new load. */
   update_L2_use(idx, memline);

   return MemAccess;
}
933
934
935
936
/* UPDATE_USE(L) instantiates update_<L>_use() for an L1 cache <L> (I1/D1).
 *
 * Called when the line at cache->loaded[idx]/use[idx] is replaced (or at
 * shutdown with mask==0, memline==0 to flush counters):
 *  - charges access cost (1000/use count) and spatial loss (unused bytes,
 *    scaled via the 32-bit use mask) to the instruction that loaded the line;
 *  - propagates the L1 line's use mask/count into the dependent L2 line;
 *  - re-initializes the records for the newly loaded line and forwards the
 *    access to the L2 via cacheuse_L2_access() (skipped for the memline==0
 *    flush case, which reports L2_Hit).
 */
#define UPDATE_USE(L)						     \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
			       UInt mask, Addr memline)              \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
	   cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
	     use->count, c, use->mask, loaded->memline, loaded->iaddr);	\
    CLG_DEBUG(2, " collect: %d, use_base %p\n",	                     \
	     CLG_(current_state).collect, loaded->use_base);         \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/L2 line sizes must be equal ! */              \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask = mask;                                                  \
  loaded->memline = memline;                                         \
  loaded->iaddr = bb_base + current_ii->instr_offset;                \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    cost_base + current_ii->cost_offset;                             \
                                                                     \
  if (memline == 0) return L2_Hit;                                   \
  return cacheuse_L2_access(memline, loaded);                        \
}

/* Instantiate update_I1_use / update_D1_use ... */
UPDATE_USE(I1);
UPDATE_USE(D1);

/* ... and the cacheuse_<L>_doRead access handlers (CACHEUSE is defined
 * earlier in this file, outside this view). */
CACHEUSE(I1);
CACHEUSE(D1);
981
982
983static
984void cacheuse_finish(void)
985{
986 int i;
987 InstrInfo ii = { 0,0,0,0,0 };
988
989 if (!CLG_(current_state).collect) return;
990
991 bb_base = 0;
992 current_ii = &ii;
993 cost_base = 0;
994
995 /* update usage counters */
996 if (I1.use)
997 for (i = 0; i < I1.sets * I1.assoc; i++)
998 if (I1.loaded[i].use_base)
999 update_I1_use( &I1, i, 0,0);
1000
1001 if (D1.use)
1002 for (i = 0; i < D1.sets * D1.assoc; i++)
1003 if (D1.loaded[i].use_base)
1004 update_D1_use( &D1, i, 0,0);
1005
1006 if (L2.use)
1007 for (i = 0; i < L2.sets * L2.assoc; i++)
1008 if (L2.loaded[i].use_base)
1009 update_L2_use(i, 0);
1010}
1011
1012
1013
1014/*------------------------------------------------------------*/
1015/*--- Helper functions called by instrumented code ---*/
1016/*------------------------------------------------------------*/
1017
1018
/* Add one access result to two cost vectors (per-instruction and global).
 *
 * The event layout appears to be (per usage in cachesim_printstat):
 *   c[0] = total refs, c[1] = L1 misses, c[2] = L2 misses,
 *   c[3] = write-backs (only counted with --simulate-wb).
 *
 * The switch deliberately falls through: a result deeper in the hierarchy
 * implies all the cheaper events too (e.g. a memory access is also an
 * L2 miss, an L1 miss, and a reference).  Do not reorder the cases —
 * correctness depends on the CacheModelResult ordering.
 */
static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
	case WriteBackMemAccess:
	    if (clo_simulate_writeback) {
		c1[3]++;
		c2[3]++;
	    }
	    // fall through

	case MemAccess:
	    c1[2]++;
	    c2[2]++;
	    // fall through

	case L2_Hit:
	    c1[1]++;
	    c2[1]++;
	    // fall through

	default:
	    c1[0]++;
	    c2[0]++;
    }
}
1045
1046
1047VG_REGPARM(1)
1048static void log_1I0D(InstrInfo* ii)
1049{
1050 CacheModelResult IrRes;
1051
1052 current_ii = ii;
1053 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1054
barta0b6b2c2008-07-07 06:49:24 +00001055 CLG_DEBUG(6, "log_1I0D: Ir=%#lx/%u => Ir %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001056 bb_base + ii->instr_offset, ii->instr_size, IrRes);
1057
1058 if (CLG_(current_state).collect) {
1059 ULong* cost_Ir;
1060
1061 if (CLG_(current_state).nonskipped)
1062 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1063 else
1064 cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
1065
1066 inc_costs(IrRes, cost_Ir,
1067 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1068 }
1069}
1070
1071
1072/* Instruction doing a read access */
1073
1074VG_REGPARM(2)
1075static void log_1I1Dr(InstrInfo* ii, Addr data)
1076{
1077 CacheModelResult IrRes, DrRes;
1078
1079 current_ii = ii;
1080 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1081 DrRes = (*simulator.D1_Read)(data, ii->data_size);
1082
barta0b6b2c2008-07-07 06:49:24 +00001083 CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001084 bb_base + ii->instr_offset, ii->instr_size,
1085 data, ii->data_size, IrRes, DrRes);
1086
1087 if (CLG_(current_state).collect) {
1088 ULong *cost_Ir, *cost_Dr;
1089
1090 if (CLG_(current_state).nonskipped) {
1091 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1092 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1093 }
1094 else {
1095 cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
1096 cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1097 }
1098
1099 inc_costs(IrRes, cost_Ir,
1100 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1101 inc_costs(DrRes, cost_Dr,
1102 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1103 }
1104}
1105
1106
1107VG_REGPARM(2)
1108static void log_0I1Dr(InstrInfo* ii, Addr data)
1109{
1110 CacheModelResult DrRes;
1111
1112 current_ii = ii;
1113 DrRes = (*simulator.D1_Read)(data, ii->data_size);
1114
barta0b6b2c2008-07-07 06:49:24 +00001115 CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001116 data, ii->data_size, DrRes);
1117
1118 if (CLG_(current_state).collect) {
1119 ULong *cost_Dr;
1120
1121 if (CLG_(current_state).nonskipped) {
1122 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1123 }
1124 else {
1125 cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1126 }
1127
1128 inc_costs(DrRes, cost_Dr,
1129 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1130 }
1131}
1132
1133
1134/* Instruction doing a write access */
1135
1136VG_REGPARM(2)
1137static void log_1I1Dw(InstrInfo* ii, Addr data)
1138{
1139 CacheModelResult IrRes, DwRes;
1140
1141 current_ii = ii;
1142 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1143 DwRes = (*simulator.D1_Write)(data, ii->data_size);
1144
barta0b6b2c2008-07-07 06:49:24 +00001145 CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001146 bb_base + ii->instr_offset, ii->instr_size,
1147 data, ii->data_size, IrRes, DwRes);
1148
1149 if (CLG_(current_state).collect) {
1150 ULong *cost_Ir, *cost_Dw;
1151
1152 if (CLG_(current_state).nonskipped) {
1153 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
1154 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1155 }
1156 else {
1157 cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
1158 cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
1159 }
1160
1161 inc_costs(IrRes, cost_Ir,
1162 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1163 inc_costs(DwRes, cost_Dw,
1164 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1165 }
1166}
1167
1168VG_REGPARM(2)
1169static void log_0I1Dw(InstrInfo* ii, Addr data)
1170{
1171 CacheModelResult DwRes;
1172
1173 current_ii = ii;
1174 DwRes = (*simulator.D1_Write)(data, ii->data_size);
1175
barta0b6b2c2008-07-07 06:49:24 +00001176 CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001177 data, ii->data_size, DwRes);
1178
1179 if (CLG_(current_state).collect) {
1180 ULong *cost_Dw;
1181
1182 if (CLG_(current_state).nonskipped) {
1183 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
1184 }
1185 else {
1186 cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
1187 }
1188
1189 inc_costs(DwRes, cost_Dw,
1190 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1191 }
1192}
1193
1194/* Instruction doing a read and a write access */
1195
1196VG_REGPARM(3)
1197static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
1198{
1199 CacheModelResult IrRes, DrRes, DwRes;
1200
1201 current_ii = ii;
1202 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1203 DrRes = (*simulator.D1_Read)(data1, ii->data_size);
1204 DwRes = (*simulator.D1_Write)(data2, ii->data_size);
1205
1206 CLG_DEBUG(6,
barta0b6b2c2008-07-07 06:49:24 +00001207 "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001208 bb_base + ii->instr_offset, ii->instr_size,
1209 data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);
1210
1211 if (CLG_(current_state).collect) {
1212 ULong *cost_Ir, *cost_Dr, *cost_Dw;
1213
1214 if (CLG_(current_state).nonskipped) {
1215 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
1216 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
1217 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1218 }
1219 else {
1220 cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
1221 cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
1222 cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
1223 }
1224
1225 inc_costs(IrRes, cost_Ir,
1226 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1227 inc_costs(DrRes, cost_Dr,
1228 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1229 inc_costs(DwRes, cost_Dw,
1230 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1231 }
1232}
1233
1234VG_REGPARM(3)
1235static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
1236{
1237 CacheModelResult DrRes, DwRes;
1238
1239 current_ii = ii;
1240 DrRes = (*simulator.D1_Read)(data1, ii->data_size);
1241 DwRes = (*simulator.D1_Write)(data2, ii->data_size);
1242
1243 CLG_DEBUG(6,
barta0b6b2c2008-07-07 06:49:24 +00001244 "log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001245 data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);
1246
1247 if (CLG_(current_state).collect) {
1248 ULong *cost_Dr, *cost_Dw;
1249
1250 if (CLG_(current_state).nonskipped) {
1251 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
1252 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1253 }
1254 else {
1255 cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
1256 cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
1257 }
1258
1259 inc_costs(DrRes, cost_Dr,
1260 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1261 inc_costs(DwRes, cost_Dw,
1262 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1263 }
1264}
1265
1266
1267/*------------------------------------------------------------*/
1268/*--- Cache configuration ---*/
1269/*------------------------------------------------------------*/
1270
/* Sentinel meaning "not set on the command line" for all three fields. */
#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })

/* Cache geometries from --I1/--D1/--L2; override auto-detection when set. */
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;
1276
1277
1278/* Checks cache config is ok; makes it so if not. */
1279static
1280void check_cache(cache_t* cache, Char *name)
1281{
weidendo144b76c2009-01-26 22:56:14 +00001282 /* Simulator requires line size and set count to be powers of two */
1283 if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
1284 (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
weidendoa17f2a32006-03-20 10:27:30 +00001285 VG_(message)(Vg_UserMsg,
weidendo144b76c2009-01-26 22:56:14 +00001286 "error: %s set count not a power of two; aborting.",
1287 name);
weidendoa17f2a32006-03-20 10:27:30 +00001288 }
1289
weidendo144b76c2009-01-26 22:56:14 +00001290 if (-1 == VG_(log2)(cache->line_size)) {
weidendoa17f2a32006-03-20 10:27:30 +00001291 VG_(message)(Vg_UserMsg,
1292 "error: %s line size of %dB not a power of two; aborting.",
1293 name, cache->line_size);
1294 VG_(exit)(1);
1295 }
1296
1297 // Then check line size >= 16 -- any smaller and a single instruction could
1298 // straddle three cache lines, which breaks a simulation assertion and is
1299 // stupid anyway.
1300 if (cache->line_size < MIN_LINE_SIZE) {
1301 VG_(message)(Vg_UserMsg,
1302 "error: %s line size of %dB too small; aborting.",
1303 name, cache->line_size);
1304 VG_(exit)(1);
1305 }
1306
1307 /* Then check cache size > line size (causes seg faults if not). */
1308 if (cache->size <= cache->line_size) {
1309 VG_(message)(Vg_UserMsg,
1310 "error: %s cache size of %dB <= line size of %dB; aborting.",
1311 name, cache->size, cache->line_size);
1312 VG_(exit)(1);
1313 }
1314
1315 /* Then check assoc <= (size / line size) (seg faults otherwise). */
1316 if (cache->assoc > (cache->size / cache->line_size)) {
1317 VG_(message)(Vg_UserMsg,
1318 "warning: %s associativity > (size / line size); aborting.", name);
1319 VG_(exit)(1);
1320 }
1321}
1322
1323static
1324void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
1325{
1326#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
1327
1328 Int n_clos = 0;
1329
1330 // Count how many were defined on the command line.
1331 if (DEFINED(clo_I1_cache)) { n_clos++; }
1332 if (DEFINED(clo_D1_cache)) { n_clos++; }
1333 if (DEFINED(clo_L2_cache)) { n_clos++; }
1334
1335 // Set the cache config (using auto-detection, if supported by the
1336 // architecture)
1337 VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );
1338
1339 // Then replace with any defined on the command line.
1340 if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
1341 if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
1342 if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
1343
1344 // Then check values and fix if not acceptable.
1345 check_cache(I1c, "I1");
1346 check_cache(D1c, "D1");
1347 check_cache(L2c, "L2");
1348
1349 if (VG_(clo_verbosity) > 1) {
1350 VG_(message)(Vg_UserMsg, "Cache configuration used:");
1351 VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines",
1352 I1c->size, I1c->assoc, I1c->line_size);
1353 VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines",
1354 D1c->size, D1c->assoc, D1c->line_size);
1355 VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines",
1356 L2c->size, L2c->assoc, L2c->line_size);
1357 }
1358#undef CMD_LINE_DEFINED
1359}
1360
1361
/* Initialize and clear simulator state.
 *
 * Runs after command-line processing:
 *  - with --simulate-cache=no, all logging hooks are nulled out and we
 *    return early;
 *  - otherwise the three caches are configured/initialized, the standard
 *    logging helpers are installed, and the simulator function-pointer
 *    table is wired to the variant selected by --cacheuse /
 *    --simulate-hwpref / --simulate-wb (checked in that priority order).
 */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t I1c, D1c, L2c;

  /* Initialize access handlers */
  if (!CLG_(clo).simulate_cache) {
    /* No cache simulation: clear every hook so instrumentation skips them. */
    CLG_(cachesim).log_1I0D = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I2D = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw_name = "(no function)";
    CLG_(cachesim).log_1I2D_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I2D = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    CLG_(cachesim).log_0I2D_name = "(no function)";
    return;
  }

  /* Configuration of caches only needed with real cache simulation */
  configure_caches(&I1c, &D1c, &L2c);

  I1.name = "I1";
  D1.name = "D1";
  L2.name = "L2";

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(L2c, &L2);

  /* the other cache simulators use the standard helpers
   * with dispatching via simulator struct */

  CLG_(cachesim).log_1I0D = log_1I0D;
  CLG_(cachesim).log_1I0D_name = "log_1I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I2D = log_1I2D;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
  CLG_(cachesim).log_1I2D_name = "log_1I2D";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I2D = log_0I2D;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
  CLG_(cachesim).log_0I2D_name = "log_0I2D";

  if (clo_collect_cacheuse) {

    /* Output warning for not supported option combinations */
    if (clo_simulate_hwpref) {
      VG_(message)(Vg_DebugMsg,
		   "warning: prefetch simulation can not be used with cache usage");
      clo_simulate_hwpref = False;
    }

    if (clo_simulate_writeback) {
      VG_(message)(Vg_DebugMsg,
		   "warning: write-back simulation can not be used with cache usage");
      clo_simulate_writeback = False;
    }

    simulator.I1_Read = cacheuse_I1_doRead;
    simulator.D1_Read = cacheuse_D1_doRead;
    /* NOTE(review): writes also route through cacheuse_D1_doRead —
     * presumably usage tracking treats reads and writes identically;
     * confirm this is intentional and not a copy/paste slip. */
    simulator.D1_Write = cacheuse_D1_doRead;
    return;
  }

  if (clo_simulate_hwpref) {
    prefetch_clear();

    if (clo_simulate_writeback) {
      simulator.I1_Read = prefetch_I1_Read;
      simulator.D1_Read = prefetch_D1_Read;
      simulator.D1_Write = prefetch_D1_Write;
    }
    else {
      simulator.I1_Read = prefetch_I1_ref;
      simulator.D1_Read = prefetch_D1_ref;
      simulator.D1_Write = prefetch_D1_ref;
    }

    return;
  }

  if (clo_simulate_writeback) {
      simulator.I1_Read = cachesim_I1_Read;
      simulator.D1_Read = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
  }
  else {
      simulator.I1_Read = cachesim_I1_ref;
      simulator.D1_Read = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
  }
}
1469
1470
1471/* Clear simulator state. Has to be initialized before */
1472static
1473void cachesim_clear(void)
1474{
1475 cachesim_clearcache(&I1);
1476 cachesim_clearcache(&D1);
1477 cachesim_clearcache(&L2);
1478
1479 prefetch_clear();
1480}
1481
1482
1483static void cachesim_getdesc(Char* buf)
1484{
1485 Int p;
1486 p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1487 p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1488 VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
1489}
1490
/* Print the cache-simulator command-line options for --help output. */
static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n cache simulator options:\n"
" --simulate-cache=no|yes Do cache simulation [no]\n"
" --simulate-wb=no|yes Count write-back events [no]\n"
" --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
" --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
" --cacheuse=no|yes Collect cache block use [no]\n"
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
	      );
}
1508
njn83df0b62009-02-25 01:01:05 +00001509static void parse_opt ( cache_t* cache, char* opt )
weidendoa17f2a32006-03-20 10:27:30 +00001510{
njn83df0b62009-02-25 01:01:05 +00001511 Long i1, i2, i3;
1512 Char* endptr;
weidendoa17f2a32006-03-20 10:27:30 +00001513
njn83df0b62009-02-25 01:01:05 +00001514 // Option argument looks like "65536,2,64". Extract them.
1515 i1 = VG_(strtoll10)(opt, &endptr); if (*endptr != ',') goto bad;
1516 i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',') goto bad;
1517 i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
weidendoa17f2a32006-03-20 10:27:30 +00001518
njn83df0b62009-02-25 01:01:05 +00001519 // Check for overflow.
1520 cache->size = (Int)i1;
1521 cache->assoc = (Int)i2;
1522 cache->line_size = (Int)i3;
1523 if (cache->size != i1) goto overflow;
1524 if (cache->assoc != i2) goto overflow;
1525 if (cache->line_size != i3) goto overflow;
weidendoa17f2a32006-03-20 10:27:30 +00001526
1527 return;
1528
njn83df0b62009-02-25 01:01:05 +00001529 overflow:
1530 VG_(message)(Vg_UserMsg,
1531 "one of the cache parameters was too large and overflowed\n");
weidendoa17f2a32006-03-20 10:27:30 +00001532 bad:
njn83df0b62009-02-25 01:01:05 +00001533 // XXX: this omits the "--I1/D1/L2=" part from the message, but that's
1534 // not a big deal.
1535 VG_(err_bad_option)(opt);
weidendoa17f2a32006-03-20 10:27:30 +00001536}
1537
/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 *
 * Note: the brace-less `if VG_BOOL_CLO(...)` form relies on the macro
 * expanding to a fully parenthesized condition.
 */
static Bool cachesim_parse_opt(Char* arg)
{
  Char* tmp_str;

  if VG_BOOL_CLO(arg, "--simulate-wb", clo_simulate_writeback) {}
  else if VG_BOOL_CLO(arg, "--simulate-hwpref", clo_simulate_hwpref) {}
  else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors) {}

  else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
      if (clo_collect_cacheuse) {
	  /* Use counters only make sense with fine dumping */
	  CLG_(clo).dump_instr = True;
      }
  }

  else if VG_STR_CLO(arg, "--I1", tmp_str)
    parse_opt(&clo_I1_cache, tmp_str);
  else if VG_STR_CLO(arg, "--D1", tmp_str)
    parse_opt(&clo_D1_cache, tmp_str);
  else if VG_STR_CLO(arg, "--L2", tmp_str)
    parse_opt(&clo_L2_cache, tmp_str);
  else
    return False;

  return True;
}
1569
/* Adds commas to ULong, right justifying in a field `field_width` wide;
 * returns the formatted string in buf and the new (comma-included) length.
 * buf must be large enough for the digits, commas, padding and NUL.
 *
 * Works in place: the plain digits are printed first, then shifted right
 * (highest index first, so the overlapping copy is safe) while commas are
 * inserted every third digit from the right. */
static
Int commify(ULong n, int field_width, char* buf)
{
  int len, n_commas, i, j, new_len, space;

  VG_(sprintf)(buf, "%llu", n);
  len = VG_(strlen)(buf);
  n_commas = (len - 1) / 3;
  new_len = len + n_commas;
  space = field_width - new_len;

  /* Allow for printing a number in a field_width smaller than it's size */
  if (space < 0) space = 0;

  /* Make j = -1 because we copy the '\0' before doing the numbers in groups
   * of three. */
  for (j = -1, i = len ; i >= 0; i--) {
    buf[i + n_commas + space] = buf[i];

    if ((i>0) && (3 == ++j)) {
      j = 0;
      n_commas--;
      buf[i + n_commas + space] = ',';
    }
  }
  /* Right justify in field. */
  for (i = 0; i < space; i++) buf[i] = ' ';
  return new_len;
}
1601
1602static
1603void percentify(Int n, Int ex, Int field_width, char buf[])
1604{
1605 int i, len, space;
1606
1607 VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
1608 len = VG_(strlen)(buf);
1609 space = field_width - len;
1610 if (space < 0) space = 0; /* Allow for v. small field_width */
1611 i = len;
1612
1613 /* Right justify in field */
1614 for ( ; i >= 0; i--) buf[i + space] = buf[i];
1615 for (i = 0; i < space; i++) buf[i] = ' ';
1616}
1617
/* Print the end-of-run cache statistics summary (I1/D1/L2 references,
 * misses and miss rates) from the global cost totals.
 *
 * Layout of each event triple (see inc_costs): [off] = refs,
 * [off+1] = L1 misses, [off+2] = L2 misses.  Column widths l1/l2/l3 are
 * derived from the widest numbers so subsequent rows line up.  Zero
 * totals are bumped to 1 before division to avoid divide-by-zero. */
static
void cachesim_printstat(void)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong L2_total_m, L2_total_mr, L2_total_mw,
    L2_total, L2_total_r, L2_total_w;
  char buf1[RESULTS_BUF_LEN],
    buf2[RESULTS_BUF_LEN],
    buf3[RESULTS_BUF_LEN];
  Int l1, l2, l3;
  Int p;   /* fixed-point scale passed to percentify */

  if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
		 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
		 prefetch_down);
    VG_(message)(Vg_DebugMsg, "");
  }

  /* I cache results. Use the I_refs value to determine the first column
   * width. */
  l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
  VG_(message)(Vg_UserMsg, "I refs: %s", buf1);

  /* Without cache simulation only the instruction count is meaningful. */
  if (!CLG_(clo).simulate_cache) return;

  commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);

  commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);

  p = 100;

  /* Guard against division by zero below. */
  if (0 == total[CLG_(sets).off_full_Ir])
    total[CLG_(sets).off_full_Ir] = 1;

  percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
	     total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);

  percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
	     total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
  VG_(message)(Vg_UserMsg, "");

  /* D cache results.
     Use the D_refs.rd and D_refs.wr values to determine the
   * width of columns 2 & 3. */

  /* D_total = Dr events + Dw events, accumulated into a fresh cost vector. */
  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
  CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );

  commify( D_total[0], l1, buf1);
  l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
  l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
  VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  commify( D_total[1], l1, buf1);
  commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
  commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
  commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  p = 10;

  if (0 == D_total[0]) D_total[0] = 1;
  if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
  if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;

  percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
  percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
	     total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);

  percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
  percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
	     total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
  VG_(message)(Vg_UserMsg, "");



  /* L2 overall results */

  /* L2 references = all L1 misses (I + D reads + D writes). */
  L2_total =
    total[CLG_(sets).off_full_Dr +1] +
    total[CLG_(sets).off_full_Dw +1] +
    total[CLG_(sets).off_full_Ir +1];
  L2_total_r =
    total[CLG_(sets).off_full_Dr +1] +
    total[CLG_(sets).off_full_Ir +1];
  L2_total_w = total[CLG_(sets).off_full_Dw +1];
  commify(L2_total, l1, buf1);
  commify(L2_total_r, l2, buf2);
  commify(L2_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  /* L2 misses = accesses that went all the way to memory. */
  L2_total_m =
    total[CLG_(sets).off_full_Dr +2] +
    total[CLG_(sets).off_full_Dw +2] +
    total[CLG_(sets).off_full_Ir +2];
  L2_total_mr =
    total[CLG_(sets).off_full_Dr +2] +
    total[CLG_(sets).off_full_Ir +2];
  L2_total_mw = total[CLG_(sets).off_full_Dw +2];
  commify(L2_total_m, l1, buf1);
  commify(L2_total_mr, l2, buf2);
  commify(L2_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  percentify(L2_total_m * 100 * p /
	     (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
  percentify(L2_total_mr * 100 * p /
	     (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
	     p, l2+1, buf2);
  percentify(L2_total_mw * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
	       buf1, buf2,buf3);
}
1755
1756
1757/*------------------------------------------------------------*/
1758/*--- Setup for Event set. ---*/
1759/*------------------------------------------------------------*/
1760
1761struct event_sets CLG_(sets);
1762
1763void CLG_(init_eventsets)(Int max_user)
1764{
1765 EventType * e1, *e2, *e3, *e4;
1766 EventSet *Ir, *Dr, *Dw;
1767 EventSet *D0, *D1r, *D1w, *D2;
1768 EventSet *sim, *full;
1769 EventSet *use;
1770 int sizeOfUseIr;
1771
1772 use = CLG_(get_eventset)("Use", 4);
1773 if (clo_collect_cacheuse) {
1774 /* if TUse is 0, there was never a load, and no loss, too */
1775 e1 = CLG_(register_eventtype)("AcCost1");
1776 CLG_(add_eventtype)(use, e1);
1777 e1 = CLG_(register_eventtype)("SpLoss1");
1778 CLG_(add_eventtype)(use, e1);
1779 e1 = CLG_(register_eventtype)("AcCost2");
1780 CLG_(add_eventtype)(use, e1);
1781 e1 = CLG_(register_eventtype)("SpLoss2");
1782 CLG_(add_eventtype)(use, e1);
1783 }
1784
1785 Ir = CLG_(get_eventset)("Ir", 4);
1786 Dr = CLG_(get_eventset)("Dr", 4);
1787 Dw = CLG_(get_eventset)("Dw", 4);
1788 if (CLG_(clo).simulate_cache) {
1789 e1 = CLG_(register_eventtype)("Ir");
1790 e2 = CLG_(register_eventtype)("I1mr");
1791 e3 = CLG_(register_eventtype)("I2mr");
1792 if (clo_simulate_writeback) {
1793 e4 = CLG_(register_eventtype)("I2dmr");
1794 CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
1795 }
1796 else
1797 CLG_(add_dep_event3)(Ir, e1,e2,e3);
1798
1799 e1 = CLG_(register_eventtype)("Dr");
1800 e2 = CLG_(register_eventtype)("D1mr");
1801 e3 = CLG_(register_eventtype)("D2mr");
1802 if (clo_simulate_writeback) {
1803 e4 = CLG_(register_eventtype)("D2dmr");
1804 CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
1805 }
1806 else
1807 CLG_(add_dep_event3)(Dr, e1,e2,e3);
1808
1809 e1 = CLG_(register_eventtype)("Dw");
1810 e2 = CLG_(register_eventtype)("D1mw");
1811 e3 = CLG_(register_eventtype)("D2mw");
1812 if (clo_simulate_writeback) {
1813 e4 = CLG_(register_eventtype)("D2dmw");
1814 CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
1815 }
1816 else
1817 CLG_(add_dep_event3)(Dw, e1,e2,e3);
1818
1819 }
1820 else {
1821 e1 = CLG_(register_eventtype)("Ir");
1822 CLG_(add_eventtype)(Ir, e1);
1823 }
1824
1825 sizeOfUseIr = use->size + Ir->size;
1826 D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
1827 CLG_(add_eventset)(D0, use);
1828 off_D0_Ir = CLG_(add_eventset)(D0, Ir);
1829
1830 D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
1831 CLG_(add_eventset)(D1r, use);
1832 off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
1833 off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
1834
1835 D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
1836 CLG_(add_eventset)(D1w, use);
1837 off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
1838 off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);
1839
1840 D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
1841 CLG_(add_eventset)(D2, use);
1842 off_D2_Ir = CLG_(add_eventset)(D2, Ir);
1843 off_D2_Dr = CLG_(add_eventset)(D2, Dr);
1844 off_D2_Dw = CLG_(add_eventset)(D2, Dw);
1845
1846 sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
1847 CLG_(add_eventset)(sim, use);
1848 CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
1849 CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
1850 CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);
1851
1852 if (CLG_(clo).collect_alloc) max_user += 2;
1853 if (CLG_(clo).collect_systime) max_user += 2;
1854
1855 full = CLG_(get_eventset)("full", sim->size + max_user);
1856 CLG_(add_eventset)(full, sim);
1857 CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
1858 CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
1859 CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;
1860
1861 CLG_(sets).use = use;
1862 CLG_(sets).Ir = Ir;
1863 CLG_(sets).Dr = Dr;
1864 CLG_(sets).Dw = Dw;
1865
1866 CLG_(sets).D0 = D0;
1867 CLG_(sets).D1r = D1r;
1868 CLG_(sets).D1w = D1w;
1869 CLG_(sets).D2 = D2;
1870
1871 CLG_(sets).sim = sim;
1872 CLG_(sets).full = full;
1873
1874 if (CLG_(clo).collect_alloc) {
1875 e1 = CLG_(register_eventtype)("allocCount");
1876 e2 = CLG_(register_eventtype)("allocSize");
1877 CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
1878 }
1879
1880 if (CLG_(clo).collect_systime) {
1881 e1 = CLG_(register_eventtype)("sysCount");
1882 e2 = CLG_(register_eventtype)("sysTime");
1883 CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
1884 }
1885
1886 CLG_DEBUGIF(1) {
1887 CLG_DEBUG(1, "EventSets:\n");
1888 CLG_(print_eventset)(-2, use);
1889 CLG_(print_eventset)(-2, Ir);
1890 CLG_(print_eventset)(-2, Dr);
1891 CLG_(print_eventset)(-2, Dw);
1892 CLG_(print_eventset)(-2, sim);
1893 CLG_(print_eventset)(-2, full);
1894 }
1895
1896 /* Not-existing events are silently ignored */
1897 CLG_(dumpmap) = CLG_(get_eventmapping)(full);
1898 CLG_(append_event)(CLG_(dumpmap), "Ir");
1899 CLG_(append_event)(CLG_(dumpmap), "Dr");
1900 CLG_(append_event)(CLG_(dumpmap), "Dw");
1901 CLG_(append_event)(CLG_(dumpmap), "I1mr");
1902 CLG_(append_event)(CLG_(dumpmap), "D1mr");
1903 CLG_(append_event)(CLG_(dumpmap), "D1mw");
1904 CLG_(append_event)(CLG_(dumpmap), "I2mr");
1905 CLG_(append_event)(CLG_(dumpmap), "D2mr");
1906 CLG_(append_event)(CLG_(dumpmap), "D2mw");
1907 CLG_(append_event)(CLG_(dumpmap), "I2dmr");
1908 CLG_(append_event)(CLG_(dumpmap), "D2dmr");
1909 CLG_(append_event)(CLG_(dumpmap), "D2dmw");
1910 CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1911 CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1912 CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1913 CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1914 CLG_(append_event)(CLG_(dumpmap), "allocCount");
1915 CLG_(append_event)(CLG_(dumpmap), "allocSize");
1916 CLG_(append_event)(CLG_(dumpmap), "sysCount");
1917 CLG_(append_event)(CLG_(dumpmap), "sysTime");
1918
1919}
1920
1921
1922
1923static
1924void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
1925{
1926 /* if eventset use is defined, it is always first (hardcoded!) */
1927 CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);
1928
1929 /* FIXME: This is hardcoded... */
1930 if (es == CLG_(sets).D0) {
1931 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1932 cost + off_D0_Ir);
1933 }
1934 else if (es == CLG_(sets).D1r) {
1935 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1936 cost + off_D1r_Ir);
1937 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
1938 cost + off_D1r_Dr);
1939 }
1940 else if (es == CLG_(sets).D1w) {
1941 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1942 cost + off_D1w_Ir);
1943 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
1944 cost + off_D1w_Dw);
1945 }
1946 else {
1947 CLG_ASSERT(es == CLG_(sets).D2);
1948 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1949 cost + off_D2_Ir);
1950 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
1951 cost + off_D2_Dr);
1952 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
1953 cost + off_D2_Dw);
1954 }
1955}
1956
1957/* this is called at dump time for every instruction executed */
1958static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1959 InstrInfo* ii, ULong exe_count)
1960{
1961 if (!CLG_(clo).simulate_cache)
1962 cost[CLG_(sets).off_sim_Ir] += exe_count;
1963 else {
1964
1965#if 0
1966/* There is always a trivial case where exe_count and Ir can be
1967 * slightly different because ecounter is updated when executing
1968 * the next BB. E.g. for last BB executed, or when toggling collection
1969 */
1970 /* FIXME: Hardcoded that each eventset has Ir as first */
1971 if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
1972 VG_(printf)("==> Ir %llu, exe %llu\n",
1973 (bbcc->cost + ii->cost_offset)[0], exe_count);
1974 CLG_(print_bbcc_cost)(-2, bbcc);
1975 //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
1976 }
1977#endif
1978
1979 add_and_zero_Dx(ii->eventset, cost,
1980 bbcc->cost + ii->cost_offset);
1981 }
1982}
1983
1984static
1985void cachesim_after_bbsetup(void)
1986{
1987 BBCC* bbcc = CLG_(current_state).bbcc;
1988
1989 if (CLG_(clo).simulate_cache) {
1990 BB* bb = bbcc->bb;
1991
1992 /* only needed if log_* functions are called */
1993 bb_base = bb->obj->offset + bb->offset;
1994 cost_base = bbcc->cost;
1995 }
1996}
1997
1998static
1999void cachesim_finish(void)
2000{
2001 if (clo_collect_cacheuse)
2002 cacheuse_finish();
2003}
2004
2005/*------------------------------------------------------------*/
2006/*--- The simulator defined in this file ---*/
2007/*------------------------------------------------------------*/
2008
/* The simulator interface exported by this file.  Callgrind's core
 * talks to the cache simulator exclusively through this vtable-like
 * struct: option handling, setup/teardown, cost accounting hooks,
 * and the per-access logging callbacks. */
struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .after_bbsetup = cachesim_after_bbsetup,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  /* Logging callbacks, selected according to the simulation mode
   * (write-back / cache-use) chosen on the command line.  Until
   * post_clo_init runs they stay NULL and must not be called. */
  .log_1I0D  = 0,

  .log_1I1Dr = 0,
  .log_1I1Dw = 0,
  .log_1I2D  = 0,

  .log_0I1Dr = 0,
  .log_0I1Dw = 0,
  .log_0I2D  = 0,

  /* Human-readable names of the selected callbacks, for debug output;
   * also filled in by post_clo_init. */
  .log_1I0D_name = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",
  .log_1I2D_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
  .log_0I2D_name = "(no function)"
};
2041
2042
2043/*--------------------------------------------------------------------*/
2044/*--- end ct_sim.c ---*/
2045/*--------------------------------------------------------------------*/
2046