blob: 3d9ae6c2b7bdcb4be94c2709a241843c1fab235f [file] [log] [blame]
weidendoa17f2a32006-03-20 10:27:30 +00001
2/*--------------------------------------------------------------------*/
3/*--- Cache simulation. ---*/
4/*--- sim.c ---*/
5/*--------------------------------------------------------------------*/
6
7/*
njn9a0cba42007-04-15 22:15:57 +00008 This file is part of Callgrind, a Valgrind tool for call graph
9 profiling programs.
weidendoa17f2a32006-03-20 10:27:30 +000010
njn9a0cba42007-04-15 22:15:57 +000011 Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
weidendoa17f2a32006-03-20 10:27:30 +000012
njn9a0cba42007-04-15 22:15:57 +000013 This tool is derived from and contains code from Cachegrind
sewardj4d474d02008-02-11 11:34:59 +000014 Copyright (C) 2002-2008 Nicholas Nethercote (njn@valgrind.org)
weidendoa17f2a32006-03-20 10:27:30 +000015
16 This program is free software; you can redistribute it and/or
17 modify it under the terms of the GNU General Public License as
18 published by the Free Software Foundation; either version 2 of the
19 License, or (at your option) any later version.
20
21 This program is distributed in the hope that it will be useful, but
22 WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with this program; if not, write to the Free Software
28 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 02111-1307, USA.
30
31 The GNU General Public License is contained in the file COPYING.
32*/
33
34#include "global.h"
35
36
37/* Notes:
38 - simulates a write-allocate cache
39 - (block --> set) hash function uses simple bit selection
40 - handling of references straddling two cache blocks:
41 - counts as only one cache access (not two)
42 - both blocks hit --> one hit
43 - one block hits, the other misses --> one miss
44 - both blocks miss --> one miss (not two)
45*/
46
47/* Cache configuration */
48#include "cg_arch.h"
49
/* additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
  UInt count; /* accesses to the line since it was loaded; reset on eviction */
  UInt mask;  /* e.g. for 64Byte line size 1bit/2Byte */
} line_use;
61
/* Per cache line: bookkeeping about the load that brought the line in. */
typedef struct {
  Addr memline, iaddr; /* memory line held / instruction address that loaded it */
  line_use* dep_use; /* point to higher-level cacheblock for this memline */
  ULong* use_base;   /* cost array charged with use costs when the line is evicted */
} line_loaded;
67
/* Cache state */
typedef struct {
   char*        name;            /* printed in debug output */
   int          size;            /* bytes */
   int          assoc;
   int          line_size;       /* bytes */
   Bool         sectored;        /* prefetch nearside cacheline on read */
   int          sets;            /* (size / line_size) / assoc */
   int          sets_min_1;      /* sets - 1, used as mask to select a set */
   int          assoc_bits;      /* log2(assoc) */
   int          line_size_bits;  /* log2(line_size) */
   int          tag_shift;       /* line_size_bits + log2(sets) */
   UWord        tag_mask;        /* address bits above tag_shift */
   char         desc_line[128];  /* textual description of the configuration */
   UWord*       tags;            /* sets * assoc entries, one set after another */

  /* for cache use */
   int          line_size_mask;  /* line_size - 1 */
   int*         line_start_mask; /* line_size entries, indexed by first byte offset */
   int*         line_end_mask;   /* line_size entries, indexed by last byte offset */
   line_loaded* loaded;          /* sets * assoc entries */
   line_use*    use;             /* sets * assoc entries; 0 if use is not collected */
} cache_t2;
91
/*
 * States of flat caches in our model.
 * We use a 2-level hierarchy: references missing in I1 or D1
 * are passed on to L2.
 */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator Options (all off by default) */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */

/* Offset to events in event set, used in log_* functions */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

static Addr   bb_base;
static ULong* cost_base;
static InstrInfo* current_ii;

/* Cache use offsets */
/* FIXME: The offsets are only correct because all eventsets get
 * the "Use" set added first !
 */
static Int off_I1_AcCost  = 0;
static Int off_I1_SpLoss  = 1;
static Int off_D1_AcCost  = 0;
static Int off_D1_SpLoss  = 1;
static Int off_L2_AcCost  = 2;
static Int off_L2_SpLoss  = 3;
141
/* Cache access types.
 * Write has the value of CACHELINE_DIRTY so that a reference type can be
 * OR'ed directly into a tag entry to mark the line dirty.
 */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache.
 * MissDirty: a miss which evicted a dirty line (write-back model only).
 */
typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    L2_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

/* Signature shared by all simulator entry points: (address, access size) */
typedef CacheModelResult (*simcall_type)(Addr, UChar);

/* Entry points of the simulator model in use */
static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;
162
163/*------------------------------------------------------------*/
164/*--- Cache Simulator Initialization ---*/
165/*------------------------------------------------------------*/
166
167static void cachesim_clearcache(cache_t2* c)
168{
169 Int i;
170
171 for (i = 0; i < c->sets * c->assoc; i++)
172 c->tags[i] = 0;
173 if (c->use) {
174 for (i = 0; i < c->sets * c->assoc; i++) {
175 c->loaded[i].memline = 0;
176 c->loaded[i].use_base = 0;
177 c->loaded[i].dep_use = 0;
178 c->loaded[i].iaddr = 0;
179 c->use[i].mask = 0;
180 c->use[i].count = 0;
181 c->tags[i] = i % c->assoc; /* init lower bits as pointer */
182 }
183 }
184}
185
186static void cacheuse_initcache(cache_t2* c);
187
188/* By this point, the size/assoc/line_size has been checked. */
189static void cachesim_initcache(cache_t config, cache_t2* c)
190{
191 c->size = config.size;
192 c->assoc = config.assoc;
193 c->line_size = config.line_size;
194 c->sectored = False; // FIXME
195
196 c->sets = (c->size / c->line_size) / c->assoc;
197 c->sets_min_1 = c->sets - 1;
198 c->assoc_bits = VG_(log2)(c->assoc);
199 c->line_size_bits = VG_(log2)(c->line_size);
200 c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
201 c->tag_mask = ~((1<<c->tag_shift)-1);
202
203 /* Can bits in tag entries be used for flags?
204 * Should be always true as MIN_LINE_SIZE >= 16 */
205 CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
206
207 if (c->assoc == 1) {
208 VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
209 c->size, c->line_size,
210 c->sectored ? ", sectored":"");
211 } else {
212 VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
213 c->size, c->line_size, c->assoc,
214 c->sectored ? ", sectored":"");
215 }
216
217 c->tags = (UWord*) CLG_MALLOC(sizeof(UWord) * c->sets * c->assoc);
218 if (clo_collect_cacheuse)
219 cacheuse_initcache(c);
220 else
221 c->use = 0;
222 cachesim_clearcache(c);
223}
224
225
#if 0
/* Debug helper (disabled): dump all tag entries of <c>, one set per line. */
static void print_cache(cache_t2* c)
{
    int set, way, i;

    /* Note initialisation and update of 'i'. */
    for (i = 0, set = 0; set < c->sets; set++) {
        for (way = 0; way < c->assoc; way++, i++) {
            /* tags are UWord, i.e. need the 'l' length modifier */
            VG_(printf)("%8lx ", c->tags[i]);
        }
        VG_(printf)("\n");
    }
}
#endif
240
241
242/*------------------------------------------------------------*/
243/*--- Write Through Cache Simulation ---*/
244/*------------------------------------------------------------*/
245
246/*
247 * Simple model: L1 & L2 Write Through
248 * Does not distinguish among read and write references
249 *
250 * Simulator functions:
251 * CacheModelResult cachesim_I1_ref(Addr a, UChar size)
252 * CacheModelResult cachesim_D1_ref(Addr a, UChar size)
253 */
254
255static __inline__
256CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
257{
258 int i, j;
259 UWord *set;
260
261 /* Shifting is a bit faster than multiplying */
262 set = &(c->tags[set_no << c->assoc_bits]);
263
264 /* This loop is unrolled for just the first case, which is the most */
265 /* common. We can't unroll any further because it would screw up */
266 /* if we have a direct-mapped (1-way) cache. */
267 if (tag == set[0])
268 return Hit;
269
270 /* If the tag is one other than the MRU, move it into the MRU spot */
271 /* and shuffle the rest down. */
272 for (i = 1; i < c->assoc; i++) {
273 if (tag == set[i]) {
274 for (j = i; j > 0; j--) {
275 set[j] = set[j - 1];
276 }
277 set[0] = tag;
278 return Hit;
279 }
280 }
281
282 /* A miss; install this tag as MRU, shuffle rest down. */
283 for (j = c->assoc - 1; j > 0; j--) {
284 set[j] = set[j - 1];
285 }
286 set[0] = tag;
287
288 return Miss;
289}
290
291static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
292{
293 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
294 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
295 UWord tag = a >> c->tag_shift;
296
297 /* Access entirely within line. */
298 if (set1 == set2)
299 return cachesim_setref(c, set1, tag);
300
301 /* Access straddles two lines. */
302 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
303 else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo28e2a142006-11-22 21:00:53 +0000304 UWord tag2 = (a+size-1) >> c->tag_shift;
weidendoa17f2a32006-03-20 10:27:30 +0000305
306 /* the call updates cache structures as side effect */
307 CacheResult res1 = cachesim_setref(c, set1, tag);
weidendo28e2a142006-11-22 21:00:53 +0000308 CacheResult res2 = cachesim_setref(c, set2, tag2);
weidendoa17f2a32006-03-20 10:27:30 +0000309 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
310
311 } else {
njn8a7b41b2007-09-23 00:51:24 +0000312 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendoa17f2a32006-03-20 10:27:30 +0000313 VG_(tool_panic)("item straddles more than two cache sets");
314 }
315 return Hit;
316}
317
318static
319CacheModelResult cachesim_I1_ref(Addr a, UChar size)
320{
321 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
322 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
323 return MemAccess;
324}
325
326static
327CacheModelResult cachesim_D1_ref(Addr a, UChar size)
328{
329 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
330 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
331 return MemAccess;
332}
333
334
335/*------------------------------------------------------------*/
336/*--- Write Back Cache Simulation ---*/
337/*------------------------------------------------------------*/
338
339/*
340 * More complex model: L1 Write-through, L2 Write-back
341 * This needs to distinguish among read and write references.
342 *
343 * Simulator functions:
344 * CacheModelResult cachesim_I1_Read(Addr a, UChar size)
345 * CacheModelResult cachesim_D1_Read(Addr a, UChar size)
346 * CacheModelResult cachesim_D1_Write(Addr a, UChar size)
347 */
348
349/*
350 * With write-back, result can be a miss evicting a dirty line
351 * The dirty state of a cache line is stored in Bit0 of the tag for
352 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
353 * type (Read/Write), the line gets dirty on a write.
354 */
355static __inline__
356CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
357{
358 int i, j;
359 UWord *set, tmp_tag;
360
361 /* Shifting is a bit faster than multiplying */
362 set = &(c->tags[set_no << c->assoc_bits]);
363
364 /* This loop is unrolled for just the first case, which is the most */
365 /* common. We can't unroll any further because it would screw up */
366 /* if we have a direct-mapped (1-way) cache. */
367 if (tag == (set[0] & ~CACHELINE_DIRTY)) {
368 set[0] |= ref;
369 return Hit;
370 }
371 /* If the tag is one other than the MRU, move it into the MRU spot */
372 /* and shuffle the rest down. */
373 for (i = 1; i < c->assoc; i++) {
374 if (tag == (set[i] & ~CACHELINE_DIRTY)) {
375 tmp_tag = set[i] | ref; // update dirty flag
376 for (j = i; j > 0; j--) {
377 set[j] = set[j - 1];
378 }
379 set[0] = tmp_tag;
380 return Hit;
381 }
382 }
383
384 /* A miss; install this tag as MRU, shuffle rest down. */
385 tmp_tag = set[c->assoc - 1];
386 for (j = c->assoc - 1; j > 0; j--) {
387 set[j] = set[j - 1];
388 }
389 set[0] = tag | ref;
390
391 return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
392}
393
394
395static __inline__
396CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
397{
398 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
399 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
400 UWord tag = a & c->tag_mask;
401
402 /* Access entirely within line. */
403 if (set1 == set2)
404 return cachesim_setref_wb(c, ref, set1, tag);
405
406 /* Access straddles two lines. */
407 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
408 else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo28e2a142006-11-22 21:00:53 +0000409 UWord tag2 = (a+size-1) >> c->tag_shift;
weidendoa17f2a32006-03-20 10:27:30 +0000410
411 /* the call updates cache structures as side effect */
412 CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
weidendo28e2a142006-11-22 21:00:53 +0000413 CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);
weidendoa17f2a32006-03-20 10:27:30 +0000414
415 if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
416 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
417
418 } else {
njn8a7b41b2007-09-23 00:51:24 +0000419 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendoa17f2a32006-03-20 10:27:30 +0000420 VG_(tool_panic)("item straddles more than two cache sets");
421 }
422 return Hit;
423}
424
425
426static
427CacheModelResult cachesim_I1_Read(Addr a, UChar size)
428{
429 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
430 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
431 case Hit: return L2_Hit;
432 case Miss: return MemAccess;
433 default: break;
434 }
435 return WriteBackMemAccess;
436}
437
438static
439CacheModelResult cachesim_D1_Read(Addr a, UChar size)
440{
441 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
442 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
443 case Hit: return L2_Hit;
444 case Miss: return MemAccess;
445 default: break;
446 }
447 return WriteBackMemAccess;
448}
449
450static
451CacheModelResult cachesim_D1_Write(Addr a, UChar size)
452{
453 if ( cachesim_ref( &D1, a, size) == Hit ) {
454 /* Even for a L1 hit, the write-trough L1 passes
455 * the write to the L2 to make the L2 line dirty.
456 * But this causes no latency, so return the hit.
457 */
458 cachesim_ref_wb( &L2, Write, a, size);
459 return L1_Hit;
460 }
461 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
462 case Hit: return L2_Hit;
463 case Miss: return MemAccess;
464 default: break;
465 }
466 return WriteBackMemAccess;
467}
468
469
470/*------------------------------------------------------------*/
471/*--- Hardware Prefetch Simulation ---*/
472/*------------------------------------------------------------*/
473
/* Number of prefetches triggered for upward / downward streams */
static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8   /* number of stream slots */
#define PF_PAGEBITS 12  /* address bits below the page number (4k pages) */

/* Per stream slot: the last block referenced, and the length of the
 * current sequential run (> 0: upwards, < 0: downwards, 0: none).
 */
static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];
482
483static
484void prefetch_clear(void)
485{
486 int i;
487 for(i=0;i<PF_STREAMS;i++)
488 pf_lastblock[i] = pf_seqblocks[i] = 0;
489}
490
491/*
492 * HW Prefetch emulation
493 * Start prefetching when detecting sequential access to 3 memory blocks.
494 * One stream can be detected per 4k page.
495 */
496static __inline__
497void prefetch_L2_doref(Addr a, UChar size)
498{
499 UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
500 UInt block = ( a >> L2.line_size_bits);
501
502 if (block != pf_lastblock[stream]) {
503 if (pf_seqblocks[stream] == 0) {
504 if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
505 else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
506 }
507 else if (pf_seqblocks[stream] >0) {
508 if (pf_lastblock[stream] +1 == block) {
509 pf_seqblocks[stream]++;
510 if (pf_seqblocks[stream] >= 2) {
511 prefetch_up++;
512 cachesim_ref(&L2, a + 5 * L2.line_size,1);
513 }
514 }
515 else pf_seqblocks[stream] = 0;
516 }
517 else if (pf_seqblocks[stream] <0) {
518 if (pf_lastblock[stream] -1 == block) {
519 pf_seqblocks[stream]--;
520 if (pf_seqblocks[stream] <= -2) {
521 prefetch_down++;
522 cachesim_ref(&L2, a - 5 * L2.line_size,1);
523 }
524 }
525 else pf_seqblocks[stream] = 0;
526 }
527 pf_lastblock[stream] = block;
528 }
529}
530
531/* simple model with hardware prefetch */
532
533static
534CacheModelResult prefetch_I1_ref(Addr a, UChar size)
535{
536 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
537 prefetch_L2_doref(a,size);
538 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
539 return MemAccess;
540}
541
542static
543CacheModelResult prefetch_D1_ref(Addr a, UChar size)
544{
545 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
546 prefetch_L2_doref(a,size);
547 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
548 return MemAccess;
549}
550
551
552/* complex model with hardware prefetch */
553
554static
555CacheModelResult prefetch_I1_Read(Addr a, UChar size)
556{
557 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
558 prefetch_L2_doref(a,size);
559 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
560 case Hit: return L2_Hit;
561 case Miss: return MemAccess;
562 default: break;
563 }
564 return WriteBackMemAccess;
565}
566
567static
568CacheModelResult prefetch_D1_Read(Addr a, UChar size)
569{
570 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
571 prefetch_L2_doref(a,size);
572 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
573 case Hit: return L2_Hit;
574 case Miss: return MemAccess;
575 default: break;
576 }
577 return WriteBackMemAccess;
578}
579
580static
581CacheModelResult prefetch_D1_Write(Addr a, UChar size)
582{
583 prefetch_L2_doref(a,size);
584 if ( cachesim_ref( &D1, a, size) == Hit ) {
585 /* Even for a L1 hit, the write-trough L1 passes
586 * the write to the L2 to make the L2 line dirty.
587 * But this causes no latency, so return the hit.
588 */
589 cachesim_ref_wb( &L2, Write, a, size);
590 return L1_Hit;
591 }
592 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
593 case Hit: return L2_Hit;
594 case Miss: return MemAccess;
595 default: break;
596 }
597 return WriteBackMemAccess;
598}
599
600
601/*------------------------------------------------------------*/
602/*--- Cache Simulation with use metric collection ---*/
603/*------------------------------------------------------------*/
604
605/* can not be combined with write-back or prefetch */
606
607static
608void cacheuse_initcache(cache_t2* c)
609{
610 int i;
611 unsigned int start_mask, start_val;
612 unsigned int end_mask, end_val;
613
614 c->use = CLG_MALLOC(sizeof(line_use) * c->sets * c->assoc);
615 c->loaded = CLG_MALLOC(sizeof(line_loaded) * c->sets * c->assoc);
616 c->line_start_mask = CLG_MALLOC(sizeof(int) * c->line_size);
617 c->line_end_mask = CLG_MALLOC(sizeof(int) * c->line_size);
618
619
620 c->line_size_mask = c->line_size-1;
621
622 /* Meaning of line_start_mask/line_end_mask
623 * Example: for a given cache line, you get an access starting at
624 * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
625 * line size of 32, you have 1 bit per byte in the mask:
626 *
627 * bit31 bit8 bit5 bit 0
628 * | | | |
629 * 11..111111100000 line_start_mask[5]
630 * 00..000111111111 line_end_mask[(5+4)-1]
631 *
632 * use_mask |= line_start_mask[5] && line_end_mask[8]
633 *
634 */
635 start_val = end_val = ~0;
636 if (c->line_size < 32) {
637 int bits_per_byte = 32/c->line_size;
638 start_mask = (1<<bits_per_byte)-1;
639 end_mask = start_mask << (32-bits_per_byte);
640 for(i=0;i<c->line_size;i++) {
641 c->line_start_mask[i] = start_val;
642 start_val = start_val & ~start_mask;
643 start_mask = start_mask << bits_per_byte;
644
645 c->line_end_mask[c->line_size-i-1] = end_val;
646 end_val = end_val & ~end_mask;
647 end_mask = end_mask >> bits_per_byte;
648 }
649 }
650 else {
651 int bytes_per_bit = c->line_size/32;
652 start_mask = 1;
653 end_mask = 1 << 31;
654 for(i=0;i<c->line_size;i++) {
655 c->line_start_mask[i] = start_val;
656 c->line_end_mask[c->line_size-i-1] = end_val;
657 if ( ((i+1)%bytes_per_bit) == 0) {
658 start_val &= ~start_mask;
659 end_val &= ~end_mask;
660 start_mask <<= 1;
661 end_mask >>= 1;
662 }
663 }
664 }
665
666 CLG_DEBUG(6, "Config %s:\n", c->desc_line);
667 for(i=0;i<c->line_size;i++) {
668 CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
669 i, c->line_start_mask[i], c->line_end_mask[i]);
670 }
671
672 /* We use lower tag bits as offset pointers to cache use info.
673 * I.e. some cache parameters don't work.
674 */
675 if (c->tag_shift < c->assoc_bits) {
676 VG_(message)(Vg_DebugMsg,
677 "error: Use associativity < %d for cache use statistics!",
678 (1<<c->tag_shift) );
679 VG_(tool_panic)("Unsupported cache configuration");
680 }
681}
682
/* FIXME: A little tricky */
/* NOTE(review): everything up to the matching #endif is disabled. It
 * looks like a draft of a function-based cache use simulation: it uses
 * names which are neither parameters nor declared in this part of the
 * file (a, size, L, cacheuse_update, cacheuse_L2_miss, cacheuse_isMiss),
 * so it cannot be enabled as it stands. The CACHEUSE() macro is what is
 * used instead - confirm before reviving or removing this.
 */
#if 0

static __inline__
void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
{
    int idx = (high_idx << c->assoc_bits) | low_idx;

    c->use[idx].count ++;
    c->use[idx].mask |= use_mask;

    CLG_DEBUG(6," Hit [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",
              idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
              use_mask, c->use[idx].mask, c->use[idx].count);
}

/* only used for I1, D1 */

static __inline__
CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j, idx;
    UWord *set, tmp_tag;
    UInt use_mask;

    /* Shifting is a bit faster than multiplying */
    set = &(c->tags[set_no << c->assoc_bits]);
    use_mask =
        c->line_start_mask[a & c->line_size_mask] &
        c->line_end_mask[(a+size-1) & c->line_size_mask];

    /* This loop is unrolled for just the first case, which is the most */
    /* common. We can't unroll any further because it would screw up */
    /* if we have a direct-mapped (1-way) cache. */
    if (tag == (set[0] & c->tag_mask)) {
        cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask);
        return L1_Hit;
    }

    /* If the tag is one other than the MRU, move it into the MRU spot */
    /* and shuffle the rest down. */
    for (i = 1; i < c->assoc; i++) {
        if (tag == (set[i] & c->tag_mask)) {
            tmp_tag = set[i];
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;

            cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
            return L1_Hit;
        }
    }

    /* A miss; install this tag as MRU, shuffle rest down. */
    tmp_tag = set[L.assoc - 1] & ~c->tag_mask;
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | tmp_tag;

    cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
                     use_mask, a & ~c->line_size_mask);

    return Miss;
}


static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
{
    UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a >> c->tag_shift;

    /* Access entirely within line. */
    if (set1 == set2)
        return cacheuse_setref(c, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
        UWord tag2 = a >> c->tag_shift;

        /* the call updates cache structures as side effect */
        CacheResult res1 = cacheuse_isMiss(c, set1, tag);
        CacheResult res2 = cacheuse_isMiss(c, set2, tag2);
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

    } else {
        VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2);
        VG_(tool_panic)("item straddles more than two cache sets");
    }
    return Hit;
}
#endif
778
779
/* for I1/D1 caches
 *
 * CACHEUSE(L) expands to the definition of
 *
 *   static CacheModelResult cacheuse_<L>_doRead(Addr a, UChar size)
 *
 * which references <size> bytes at address <a> in cache L and keeps
 * the use info of the touched line(s) up to date. L has to name a
 * cache_t2 object, and a function update_<L>_use(&L, idx, use_mask,
 * memline) has to be in scope where the macro is expanded.
 *
 * A tag entry holds the address bits selected by L.tag_mask; its
 * remaining low bits, combined with the set number, form the index
 * <idx> of the line's slot in L.use[] / L.loaded[].
 *
 * - Hit:  the slot's access count is incremented and the use mask of
 *         this access is OR'ed into the slot's mask; the entry is moved
 *         to the MRU position. The result is L1_Hit.
 * - Miss: the new tag takes over the low bits (i.e. the slot) of the
 *         evicted LRU entry; update_<L>_use() is called for that slot
 *         and its return value is the result.
 * - Access straddling two adjacent sets: the first line is handled
 *         with the start mask only, the second one with the end mask
 *         only. If the second line hits, the result of the first line
 *         is returned; otherwise MemAccess if either update returned
 *         MemAccess, else L2_Hit.
 * - Anything else panics.
 */
#define CACHEUSE(L) \
 \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
{ \
   UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
   UWord tag = a & L.tag_mask; \
   UWord tag2; \
   int i, j, idx; \
   UWord *set, tmp_tag; \
   UInt use_mask; \
 \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n", \
	    L.name, a, size, set1, set2); \
 \
   /* First case: word entirely within line. */ \
   if (set1 == set2) { \
 \
      /* Shifting is a bit faster than multiplying */ \
      set = &(L.tags[set1 << L.assoc_bits]); \
      use_mask = L.line_start_mask[a & L.line_size_mask] & \
	         L.line_end_mask[(a+size-1) & L.line_size_mask]; \
 \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common. We can't unroll any further because it would screw up */\
      /* if we have a direct-mapped (1-way) cache. */\
      if (tag == (set[0] & L.tag_mask)) { \
        idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
        L.use[idx].count ++; \
        L.use[idx].mask |= use_mask; \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
		 use_mask, L.use[idx].mask, L.use[idx].count); \
	return L1_Hit; \
      } \
      /* If the tag is one other than the MRU, move it into the MRU spot */\
      /* and shuffle the rest down. */\
      for (i = 1; i < L.assoc; i++) { \
	 if (tag == (set[i] & L.tag_mask)) { \
  	    tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
		 use_mask, L.use[idx].mask, L.use[idx].count); \
            return L1_Hit; \
         } \
      } \
 \
      /* A miss; install this tag as MRU, shuffle rest down. */ \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 << L.assoc_bits) | tmp_tag; \
      return update_##L##_use(&L, idx, \
		       use_mask, a &~ L.line_size_mask); \
 \
   /* Second case: word straddles two lines. */ \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
   } else if (((set1 + 1) & (L.sets-1)) == set2) { \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
      set = &(L.tags[set1 << L.assoc_bits]); \
      use_mask = L.line_start_mask[a & L.line_size_mask]; \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
		 use_mask, L.use[idx].mask, L.use[idx].count); \
         goto block2; \
      } \
      for (i = 1; i < L.assoc; i++) { \
	 if (tag == (set[i] & L.tag_mask)) { \
  	    tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
		 use_mask, L.use[idx].mask, L.use[idx].count); \
            goto block2; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 << L.assoc_bits) | tmp_tag; \
      miss1 = update_##L##_use(&L, idx, \
		       use_mask, a &~ L.line_size_mask); \
block2: \
      set = &(L.tags[set2 << L.assoc_bits]); \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      tag2 = (a+size-1) & L.tag_mask; \
      if (tag2 == (set[0] & L.tag_mask)) { \
         idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
		 use_mask, L.use[idx].mask, L.use[idx].count); \
         return miss1; \
      } \
      for (i = 1; i < L.assoc; i++) { \
	 if (tag2 == (set[i] & L.tag_mask)) { \
  	    tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
		 use_mask, L.use[idx].mask, L.use[idx].count); \
            return miss1; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag2 | tmp_tag; \
      idx = (set2 << L.assoc_bits) | tmp_tag; \
      miss2 = update_##L##_use(&L, idx, \
		       use_mask, (a+size-1) &~ L.line_size_mask); \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
 \
   } else { \
       VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
       VG_(tool_panic)("item straddles more than two cache sets"); \
   } \
   return 0; \
}
929
930
931/* logarithmic bitcounting algorithm, see
932 * http://graphics.stanford.edu/~seander/bithacks.html
933 */
/* Return the number of bits set in the lower 32 bits of <bits>.
 *
 * Neighbouring bit fields are summed up pairwise, doubling the field
 * width in every step (1, 2, 4, 8, 16 bits).
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
    unsigned int sum = bits;

    sum = ((sum >>  1) & 0x55555555) + (sum & 0x55555555);
    sum = ((sum >>  2) & 0x33333333) + (sum & 0x33333333);
    sum = ((sum >>  4) & 0x0F0F0F0F) + (sum & 0x0F0F0F0F);
    sum = ((sum >>  8) & 0x00FF00FF) + (sum & 0x00FF00FF);
    sum = ((sum >> 16) & 0x0000FFFF) + (sum & 0x0000FFFF);

    return sum;
}
948
949static void update_L2_use(int idx, Addr memline)
950{
951 line_loaded* loaded = &(L2.loaded[idx]);
952 line_use* use = &(L2.use[idx]);
953 int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
954
barta0b6b2c2008-07-07 06:49:24 +0000955 CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
weidendoa17f2a32006-03-20 10:27:30 +0000956 idx, bb_base + current_ii->instr_offset, memline);
957 if (use->count>0) {
barta0b6b2c2008-07-07 06:49:24 +0000958 CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
weidendoa17f2a32006-03-20 10:27:30 +0000959 use->count, i, use->mask, loaded->memline, loaded->iaddr);
960 CLG_DEBUG(2, " collect: %d, use_base %p\n",
961 CLG_(current_state).collect, loaded->use_base);
962
963 if (CLG_(current_state).collect && loaded->use_base) {
964 (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
965 (loaded->use_base)[off_L2_SpLoss] += i;
966 }
967 }
968
969 use->count = 0;
970 use->mask = 0;
971
972 loaded->memline = memline;
973 loaded->iaddr = bb_base + current_ii->instr_offset;
974 loaded->use_base = (CLG_(current_state).nonskipped) ?
975 CLG_(current_state).nonskipped->skipped :
976 cost_base + current_ii->cost_offset;
977}
978
979static
980CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
981{
982 UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
983 UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
984 UWord tag = memline & L2.tag_mask;
985
986 int i, j, idx;
987 UWord tmp_tag;
988
barta0b6b2c2008-07-07 06:49:24 +0000989 CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
weidendoa17f2a32006-03-20 10:27:30 +0000990
991 if (tag == (set[0] & L2.tag_mask)) {
992 idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
993 l1_loaded->dep_use = &(L2.use[idx]);
994
barta0b6b2c2008-07-07 06:49:24 +0000995 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
weidendoa17f2a32006-03-20 10:27:30 +0000996 idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
997 L2.use[idx].mask, L2.use[idx].count);
998 return L2_Hit;
999 }
1000 for (i = 1; i < L2.assoc; i++) {
1001 if (tag == (set[i] & L2.tag_mask)) {
1002 tmp_tag = set[i];
1003 for (j = i; j > 0; j--) {
1004 set[j] = set[j - 1];
1005 }
1006 set[0] = tmp_tag;
1007 idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
1008 l1_loaded->dep_use = &(L2.use[idx]);
1009
barta0b6b2c2008-07-07 06:49:24 +00001010 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001011 i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
1012 L2.use[idx].mask, L2.use[idx].count);
1013 return L2_Hit;
1014 }
1015 }
1016
1017 /* A miss; install this tag as MRU, shuffle rest down. */
1018 tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
1019 for (j = L2.assoc - 1; j > 0; j--) {
1020 set[j] = set[j - 1];
1021 }
1022 set[0] = tag | tmp_tag;
1023 idx = (setNo << L2.assoc_bits) | tmp_tag;
1024 l1_loaded->dep_use = &(L2.use[idx]);
1025
1026 update_L2_use(idx, memline);
1027
1028 return MemAccess;
1029}
1030
1031
1032
1033
/* UPDATE_USE(L) defines
 *
 *   static CacheModelResult update_<L>_use(cache_t2* cache, int idx,
 *                                          UInt mask, Addr memline)
 *
 * which retires the usage record of line slot <idx> of <cache> and starts
 * a new record (count 1, use mask <mask>) for <memline>.
 *
 * If the old record was used, collection is on and a cost array is
 * attached, the array is charged at off_<L>_AcCost / off_<L>_SpLoss and the
 * old mask and count are merged into the record referenced by dep_use.
 *
 * Returns L2_Hit when memline is 0, otherwise the result of
 * cacheuse_L2_access() for the new line.
 */
#define UPDATE_USE(L)                                                  \
                                                                       \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx,   \
                                           UInt mask, Addr memline)    \
{                                                                      \
  line_loaded* slot  = &(cache->loaded[idx]);                          \
  line_use*    usage = &(cache->use[idx]);                             \
  int lost = ((32 - countBits(usage->mask)) * cache->line_size)>>5;    \
                                                                       \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
            cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
  if (usage->count > 0) {                                              \
    CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
              usage->count, lost, usage->mask, slot->memline, slot->iaddr); \
    CLG_DEBUG(2, " collect: %d, use_base %p\n",                        \
              CLG_(current_state).collect, slot->use_base);            \
                                                                       \
    if (CLG_(current_state).collect && slot->use_base) {               \
      (slot->use_base)[off_##L##_AcCost] += 1000 / usage->count;       \
      (slot->use_base)[off_##L##_SpLoss] += lost;                      \
                                                                       \
      /* FIXME (?): L1/L2 line sizes must be equal ! */                \
      slot->dep_use->mask  |= usage->mask;                             \
      slot->dep_use->count += usage->count;                            \
    }                                                                  \
  }                                                                    \
                                                                       \
  /* the access that caused the reload counts as the first use */      \
  usage->mask  = mask;                                                 \
  usage->count = 1;                                                    \
                                                                       \
  slot->memline = memline;                                             \
  slot->iaddr   = bb_base + current_ii->instr_offset;                  \
  if (CLG_(current_state).nonskipped)                                  \
    slot->use_base = CLG_(current_state).nonskipped->skipped;          \
  else                                                                 \
    slot->use_base = cost_base + current_ii->cost_offset;              \
                                                                       \
  return (memline == 0) ? L2_Hit                                       \
                        : cacheuse_L2_access(memline, slot);           \
}
1072
/* Instantiate the cache-use helpers for both first-level caches.
 * UPDATE_USE(L) defines update_<L>_use(); the CACHEUSE(L) body calls
 * update_<L>_use(), so the UPDATE_USE instances have to come first. */
UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);
1078
1079
1080static
1081void cacheuse_finish(void)
1082{
1083 int i;
1084 InstrInfo ii = { 0,0,0,0,0 };
1085
1086 if (!CLG_(current_state).collect) return;
1087
1088 bb_base = 0;
1089 current_ii = &ii;
1090 cost_base = 0;
1091
1092 /* update usage counters */
1093 if (I1.use)
1094 for (i = 0; i < I1.sets * I1.assoc; i++)
1095 if (I1.loaded[i].use_base)
1096 update_I1_use( &I1, i, 0,0);
1097
1098 if (D1.use)
1099 for (i = 0; i < D1.sets * D1.assoc; i++)
1100 if (D1.loaded[i].use_base)
1101 update_D1_use( &D1, i, 0,0);
1102
1103 if (L2.use)
1104 for (i = 0; i < L2.sets * L2.assoc; i++)
1105 if (L2.loaded[i].use_base)
1106 update_L2_use(i, 0);
1107}
1108
1109
1110
1111/*------------------------------------------------------------*/
1112/*--- Helper functions called by instrumented code ---*/
1113/*------------------------------------------------------------*/
1114
1115
1116static __inline__
1117void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1118{
1119 switch(r) {
1120 case WriteBackMemAccess:
1121 if (clo_simulate_writeback) {
1122 c1[3]++;
1123 c2[3]++;
1124 }
1125 // fall through
1126
1127 case MemAccess:
1128 c1[2]++;
1129 c2[2]++;
1130 // fall through
1131
1132 case L2_Hit:
1133 c1[1]++;
1134 c2[1]++;
1135 // fall through
1136
1137 default:
1138 c1[0]++;
1139 c2[0]++;
1140 }
1141}
1142
1143
1144VG_REGPARM(1)
1145static void log_1I0D(InstrInfo* ii)
1146{
1147 CacheModelResult IrRes;
1148
1149 current_ii = ii;
1150 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1151
barta0b6b2c2008-07-07 06:49:24 +00001152 CLG_DEBUG(6, "log_1I0D: Ir=%#lx/%u => Ir %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001153 bb_base + ii->instr_offset, ii->instr_size, IrRes);
1154
1155 if (CLG_(current_state).collect) {
1156 ULong* cost_Ir;
1157
1158 if (CLG_(current_state).nonskipped)
1159 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1160 else
1161 cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
1162
1163 inc_costs(IrRes, cost_Ir,
1164 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1165 }
1166}
1167
1168
1169/* Instruction doing a read access */
1170
1171VG_REGPARM(2)
1172static void log_1I1Dr(InstrInfo* ii, Addr data)
1173{
1174 CacheModelResult IrRes, DrRes;
1175
1176 current_ii = ii;
1177 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1178 DrRes = (*simulator.D1_Read)(data, ii->data_size);
1179
barta0b6b2c2008-07-07 06:49:24 +00001180 CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001181 bb_base + ii->instr_offset, ii->instr_size,
1182 data, ii->data_size, IrRes, DrRes);
1183
1184 if (CLG_(current_state).collect) {
1185 ULong *cost_Ir, *cost_Dr;
1186
1187 if (CLG_(current_state).nonskipped) {
1188 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1189 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1190 }
1191 else {
1192 cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
1193 cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1194 }
1195
1196 inc_costs(IrRes, cost_Ir,
1197 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1198 inc_costs(DrRes, cost_Dr,
1199 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1200 }
1201}
1202
1203
1204VG_REGPARM(2)
1205static void log_0I1Dr(InstrInfo* ii, Addr data)
1206{
1207 CacheModelResult DrRes;
1208
1209 current_ii = ii;
1210 DrRes = (*simulator.D1_Read)(data, ii->data_size);
1211
barta0b6b2c2008-07-07 06:49:24 +00001212 CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001213 data, ii->data_size, DrRes);
1214
1215 if (CLG_(current_state).collect) {
1216 ULong *cost_Dr;
1217
1218 if (CLG_(current_state).nonskipped) {
1219 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1220 }
1221 else {
1222 cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1223 }
1224
1225 inc_costs(DrRes, cost_Dr,
1226 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1227 }
1228}
1229
1230
1231/* Instruction doing a write access */
1232
1233VG_REGPARM(2)
1234static void log_1I1Dw(InstrInfo* ii, Addr data)
1235{
1236 CacheModelResult IrRes, DwRes;
1237
1238 current_ii = ii;
1239 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1240 DwRes = (*simulator.D1_Write)(data, ii->data_size);
1241
barta0b6b2c2008-07-07 06:49:24 +00001242 CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001243 bb_base + ii->instr_offset, ii->instr_size,
1244 data, ii->data_size, IrRes, DwRes);
1245
1246 if (CLG_(current_state).collect) {
1247 ULong *cost_Ir, *cost_Dw;
1248
1249 if (CLG_(current_state).nonskipped) {
1250 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
1251 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1252 }
1253 else {
1254 cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
1255 cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
1256 }
1257
1258 inc_costs(IrRes, cost_Ir,
1259 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1260 inc_costs(DwRes, cost_Dw,
1261 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1262 }
1263}
1264
1265VG_REGPARM(2)
1266static void log_0I1Dw(InstrInfo* ii, Addr data)
1267{
1268 CacheModelResult DwRes;
1269
1270 current_ii = ii;
1271 DwRes = (*simulator.D1_Write)(data, ii->data_size);
1272
barta0b6b2c2008-07-07 06:49:24 +00001273 CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001274 data, ii->data_size, DwRes);
1275
1276 if (CLG_(current_state).collect) {
1277 ULong *cost_Dw;
1278
1279 if (CLG_(current_state).nonskipped) {
1280 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
1281 }
1282 else {
1283 cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
1284 }
1285
1286 inc_costs(DwRes, cost_Dw,
1287 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1288 }
1289}
1290
1291/* Instruction doing a read and a write access */
1292
1293VG_REGPARM(3)
1294static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
1295{
1296 CacheModelResult IrRes, DrRes, DwRes;
1297
1298 current_ii = ii;
1299 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1300 DrRes = (*simulator.D1_Read)(data1, ii->data_size);
1301 DwRes = (*simulator.D1_Write)(data2, ii->data_size);
1302
1303 CLG_DEBUG(6,
barta0b6b2c2008-07-07 06:49:24 +00001304 "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001305 bb_base + ii->instr_offset, ii->instr_size,
1306 data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);
1307
1308 if (CLG_(current_state).collect) {
1309 ULong *cost_Ir, *cost_Dr, *cost_Dw;
1310
1311 if (CLG_(current_state).nonskipped) {
1312 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
1313 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
1314 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1315 }
1316 else {
1317 cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
1318 cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
1319 cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
1320 }
1321
1322 inc_costs(IrRes, cost_Ir,
1323 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1324 inc_costs(DrRes, cost_Dr,
1325 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1326 inc_costs(DwRes, cost_Dw,
1327 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1328 }
1329}
1330
1331VG_REGPARM(3)
1332static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
1333{
1334 CacheModelResult DrRes, DwRes;
1335
1336 current_ii = ii;
1337 DrRes = (*simulator.D1_Read)(data1, ii->data_size);
1338 DwRes = (*simulator.D1_Write)(data2, ii->data_size);
1339
1340 CLG_DEBUG(6,
barta0b6b2c2008-07-07 06:49:24 +00001341 "log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001342 data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);
1343
1344 if (CLG_(current_state).collect) {
1345 ULong *cost_Dr, *cost_Dw;
1346
1347 if (CLG_(current_state).nonskipped) {
1348 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
1349 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1350 }
1351 else {
1352 cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
1353 cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
1354 }
1355
1356 inc_costs(DrRes, cost_Dr,
1357 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1358 inc_costs(DwRes, cost_Dw,
1359 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1360 }
1361}
1362
1363
1364/*------------------------------------------------------------*/
1365/*--- Cache configuration ---*/
1366/*------------------------------------------------------------*/
1367
/* Marker value for "not given on the command line": all three fields -1.
 * configure_caches() treats a cache as defined as soon as any of
 * size / assoc / line_size differs from -1. */
#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })

/* Cache parameters from --I1= / --D1= / --L2= (filled in by parse_opt). */
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;
1373
1374
1375/* Checks cache config is ok; makes it so if not. */
1376static
1377void check_cache(cache_t* cache, Char *name)
1378{
1379 /* First check they're all powers of two */
1380 if (-1 == VG_(log2)(cache->size)) {
1381 VG_(message)(Vg_UserMsg,
1382 "error: %s size of %dB not a power of two; aborting.",
1383 name, cache->size);
1384 VG_(exit)(1);
1385 }
1386
1387 if (-1 == VG_(log2)(cache->assoc)) {
1388 VG_(message)(Vg_UserMsg,
1389 "error: %s associativity of %d not a power of two; aborting.",
1390 name, cache->assoc);
1391 VG_(exit)(1);
1392 }
1393
1394 if (-1 == VG_(log2)(cache->line_size)) {
1395 VG_(message)(Vg_UserMsg,
1396 "error: %s line size of %dB not a power of two; aborting.",
1397 name, cache->line_size);
1398 VG_(exit)(1);
1399 }
1400
1401 // Then check line size >= 16 -- any smaller and a single instruction could
1402 // straddle three cache lines, which breaks a simulation assertion and is
1403 // stupid anyway.
1404 if (cache->line_size < MIN_LINE_SIZE) {
1405 VG_(message)(Vg_UserMsg,
1406 "error: %s line size of %dB too small; aborting.",
1407 name, cache->line_size);
1408 VG_(exit)(1);
1409 }
1410
1411 /* Then check cache size > line size (causes seg faults if not). */
1412 if (cache->size <= cache->line_size) {
1413 VG_(message)(Vg_UserMsg,
1414 "error: %s cache size of %dB <= line size of %dB; aborting.",
1415 name, cache->size, cache->line_size);
1416 VG_(exit)(1);
1417 }
1418
1419 /* Then check assoc <= (size / line size) (seg faults otherwise). */
1420 if (cache->assoc > (cache->size / cache->line_size)) {
1421 VG_(message)(Vg_UserMsg,
1422 "warning: %s associativity > (size / line size); aborting.", name);
1423 VG_(exit)(1);
1424 }
1425}
1426
1427static
1428void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
1429{
1430#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
1431
1432 Int n_clos = 0;
1433
1434 // Count how many were defined on the command line.
1435 if (DEFINED(clo_I1_cache)) { n_clos++; }
1436 if (DEFINED(clo_D1_cache)) { n_clos++; }
1437 if (DEFINED(clo_L2_cache)) { n_clos++; }
1438
1439 // Set the cache config (using auto-detection, if supported by the
1440 // architecture)
1441 VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );
1442
1443 // Then replace with any defined on the command line.
1444 if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
1445 if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
1446 if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
1447
1448 // Then check values and fix if not acceptable.
1449 check_cache(I1c, "I1");
1450 check_cache(D1c, "D1");
1451 check_cache(L2c, "L2");
1452
1453 if (VG_(clo_verbosity) > 1) {
1454 VG_(message)(Vg_UserMsg, "Cache configuration used:");
1455 VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines",
1456 I1c->size, I1c->assoc, I1c->line_size);
1457 VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines",
1458 D1c->size, D1c->assoc, D1c->line_size);
1459 VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines",
1460 L2c->size, L2c->assoc, L2c->line_size);
1461 }
1462#undef CMD_LINE_DEFINED
1463}
1464
1465
1466/* Initialize and clear simulator state */
1467static void cachesim_post_clo_init(void)
1468{
1469 /* Cache configurations. */
1470 cache_t I1c, D1c, L2c;
1471
1472 /* Initialize access handlers */
1473 if (!CLG_(clo).simulate_cache) {
1474 CLG_(cachesim).log_1I0D = 0;
1475 CLG_(cachesim).log_1I0D_name = "(no function)";
1476
1477 CLG_(cachesim).log_1I1Dr = 0;
1478 CLG_(cachesim).log_1I1Dw = 0;
1479 CLG_(cachesim).log_1I2D = 0;
1480 CLG_(cachesim).log_1I1Dr_name = "(no function)";
1481 CLG_(cachesim).log_1I1Dw_name = "(no function)";
1482 CLG_(cachesim).log_1I2D_name = "(no function)";
1483
1484 CLG_(cachesim).log_0I1Dr = 0;
1485 CLG_(cachesim).log_0I1Dw = 0;
1486 CLG_(cachesim).log_0I2D = 0;
1487 CLG_(cachesim).log_0I1Dr_name = "(no function)";
1488 CLG_(cachesim).log_0I1Dw_name = "(no function)";
1489 CLG_(cachesim).log_0I2D_name = "(no function)";
1490 return;
1491 }
1492
1493 /* Configuration of caches only needed with real cache simulation */
1494 configure_caches(&I1c, &D1c, &L2c);
1495
1496 I1.name = "I1";
1497 D1.name = "D1";
1498 L2.name = "L2";
1499
1500 cachesim_initcache(I1c, &I1);
1501 cachesim_initcache(D1c, &D1);
1502 cachesim_initcache(L2c, &L2);
1503
1504 /* the other cache simulators use the standard helpers
1505 * with dispatching via simulator struct */
1506
1507 CLG_(cachesim).log_1I0D = log_1I0D;
1508 CLG_(cachesim).log_1I0D_name = "log_1I0D";
1509
1510 CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1511 CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1512 CLG_(cachesim).log_1I2D = log_1I2D;
1513 CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1514 CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1515 CLG_(cachesim).log_1I2D_name = "log_1I2D";
1516
1517 CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1518 CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1519 CLG_(cachesim).log_0I2D = log_0I2D;
1520 CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1521 CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1522 CLG_(cachesim).log_0I2D_name = "log_0I2D";
1523
1524 if (clo_collect_cacheuse) {
1525
1526 /* Output warning for not supported option combinations */
1527 if (clo_simulate_hwpref) {
1528 VG_(message)(Vg_DebugMsg,
1529 "warning: prefetch simulation can not be used with cache usage");
1530 clo_simulate_hwpref = False;
1531 }
1532
1533 if (clo_simulate_writeback) {
1534 VG_(message)(Vg_DebugMsg,
1535 "warning: write-back simulation can not be used with cache usage");
1536 clo_simulate_writeback = False;
1537 }
1538
1539 simulator.I1_Read = cacheuse_I1_doRead;
1540 simulator.D1_Read = cacheuse_D1_doRead;
1541 simulator.D1_Write = cacheuse_D1_doRead;
1542 return;
1543 }
1544
1545 if (clo_simulate_hwpref) {
1546 prefetch_clear();
1547
1548 if (clo_simulate_writeback) {
1549 simulator.I1_Read = prefetch_I1_Read;
1550 simulator.D1_Read = prefetch_D1_Read;
1551 simulator.D1_Write = prefetch_D1_Write;
1552 }
1553 else {
1554 simulator.I1_Read = prefetch_I1_ref;
1555 simulator.D1_Read = prefetch_D1_ref;
1556 simulator.D1_Write = prefetch_D1_ref;
1557 }
1558
1559 return;
1560 }
1561
1562 if (clo_simulate_writeback) {
1563 simulator.I1_Read = cachesim_I1_Read;
1564 simulator.D1_Read = cachesim_D1_Read;
1565 simulator.D1_Write = cachesim_D1_Write;
1566 }
1567 else {
1568 simulator.I1_Read = cachesim_I1_ref;
1569 simulator.D1_Read = cachesim_D1_ref;
1570 simulator.D1_Write = cachesim_D1_ref;
1571 }
1572}
1573
1574
1575/* Clear simulator state. Has to be initialized before */
1576static
1577void cachesim_clear(void)
1578{
1579 cachesim_clearcache(&I1);
1580 cachesim_clearcache(&D1);
1581 cachesim_clearcache(&L2);
1582
1583 prefetch_clear();
1584}
1585
1586
1587static void cachesim_getdesc(Char* buf)
1588{
1589 Int p;
1590 p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1591 p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1592 VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
1593}
1594
/* Print the usage text for the cache simulator's command line options.
 * The --simulate-sectors line is only part of the text when
 * CLG_EXPERIMENTAL is set at compile time. */
static
void cachesim_print_opts(void)
{
    VG_(printf)(
"\n cache simulator options:\n"
" --simulate-cache=no|yes Do cache simulation [no]\n"
" --simulate-wb=no|yes Count write-back events [no]\n"
" --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
" --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
" --cacheuse=no|yes Collect cache block use [no]\n"
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
    );
}
1612
1613static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
1614{
1615 int i1, i2, i3;
1616 int i;
1617 char *opt = VG_(strdup)(orig_opt);
1618
1619 i = i1 = opt_len;
1620
1621 /* Option looks like "--I1=65536,2,64".
1622 * Find commas, replace with NULs to make three independent
1623 * strings, then extract numbers. Yuck. */
1624 while (VG_(isdigit)(opt[i])) i++;
1625 if (',' == opt[i]) {
1626 opt[i++] = '\0';
1627 i2 = i;
1628 } else goto bad;
1629 while (VG_(isdigit)(opt[i])) i++;
1630 if (',' == opt[i]) {
1631 opt[i++] = '\0';
1632 i3 = i;
1633 } else goto bad;
1634 while (VG_(isdigit)(opt[i])) i++;
1635 if ('\0' != opt[i]) goto bad;
1636
1637 cache->size = (Int)VG_(atoll)(opt + i1);
1638 cache->assoc = (Int)VG_(atoll)(opt + i2);
1639 cache->line_size = (Int)VG_(atoll)(opt + i3);
1640
1641 VG_(free)(opt);
1642
1643 return;
1644
1645 bad:
sewardj6893d652006-10-15 01:25:13 +00001646 VG_(err_bad_option)(orig_opt);
weidendoa17f2a32006-03-20 10:27:30 +00001647}
1648
1649/* Check for command line option for cache configuration.
1650 * Return False if unknown and not handled.
1651 *
1652 * Called from CLG_(process_cmd_line_option)() in clo.c
1653 */
1654static Bool cachesim_parse_opt(Char* arg)
1655{
1656 if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
1657 clo_simulate_writeback = True;
1658 else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
1659 clo_simulate_writeback = False;
1660
1661 else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
1662 clo_simulate_hwpref = True;
1663 else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
1664 clo_simulate_hwpref = False;
1665
1666 else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
1667 clo_simulate_sectors = True;
1668 else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
1669 clo_simulate_sectors = False;
1670
1671 else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
1672 clo_collect_cacheuse = True;
1673 /* Use counters only make sense with fine dumping */
1674 CLG_(clo).dump_instr = True;
1675 }
1676 else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
1677 clo_collect_cacheuse = False;
1678
1679 /* 5 is length of "--I1=" */
1680 else if (0 == VG_(strncmp)(arg, "--I1=", 5))
1681 parse_opt(&clo_I1_cache, arg, 5);
1682 else if (0 == VG_(strncmp)(arg, "--D1=", 5))
1683 parse_opt(&clo_D1_cache, arg, 5);
1684 else if (0 == VG_(strncmp)(arg, "--L2=", 5))
1685 parse_opt(&clo_L2_cache, arg, 5);
1686 else
1687 return False;
1688
1689 return True;
1690}
1691
1692/* Adds commas to ULong, right justifying in a field field_width wide, returns
1693 * the string in buf. */
1694static
1695Int commify(ULong n, int field_width, char* buf)
1696{
1697 int len, n_commas, i, j, new_len, space;
1698
1699 VG_(sprintf)(buf, "%llu", n);
1700 len = VG_(strlen)(buf);
1701 n_commas = (len - 1) / 3;
1702 new_len = len + n_commas;
1703 space = field_width - new_len;
1704
1705 /* Allow for printing a number in a field_width smaller than it's size */
1706 if (space < 0) space = 0;
1707
1708 /* Make j = -1 because we copy the '\0' before doing the numbers in groups
1709 * of three. */
1710 for (j = -1, i = len ; i >= 0; i--) {
1711 buf[i + n_commas + space] = buf[i];
1712
1713 if ((i>0) && (3 == ++j)) {
1714 j = 0;
1715 n_commas--;
1716 buf[i + n_commas + space] = ',';
1717 }
1718 }
1719 /* Right justify in field. */
1720 for (i = 0; i < space; i++) buf[i] = ' ';
1721 return new_len;
1722}
1723
1724static
1725void percentify(Int n, Int ex, Int field_width, char buf[])
1726{
1727 int i, len, space;
1728
1729 VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
1730 len = VG_(strlen)(buf);
1731 space = field_width - len;
1732 if (space < 0) space = 0; /* Allow for v. small field_width */
1733 i = len;
1734
1735 /* Right justify in field */
1736 for ( ; i >= 0; i--) buf[i + space] = buf[i];
1737 for (i = 0; i < space; i++) buf[i] = ' ';
1738}
1739
/* Print the summary statistics of the run as user messages.
 *
 * Always prints the number of instruction references.  With cache
 * simulation enabled it goes on to print, in this order: I1/L2i misses and
 * miss rates, data references, D1/L2d misses and miss rates (each split
 * into read and write), and the combined L2 references, misses and miss
 * rates.  With hardware prefetch simulation and verbosity above 1 the
 * prefetch counters are printed first as debug messages.
 *
 * In each cost group, index +0 is the reference count, +1 the first-level
 * miss count and +2 the L2 miss count.
 *
 * NOTE(review): `total` is taken from CLG_(total_cost) and written to below
 * (zero reference counts are replaced by 1 to avoid division by zero); if
 * FullCost is a pointer type this modifies the global totals -- confirm.
 * NOTE(review): percentify() takes Int arguments while the values passed
 * are computed from the cost counters; very large counts may not fit --
 * confirm.
 */
static
void cachesim_printstat(void)
{
    FullCost total = CLG_(total_cost), D_total = 0;
    ULong L2_total_m, L2_total_mr, L2_total_mw,
          L2_total, L2_total_r, L2_total_w;
    /* buf1..buf3 are reused for every output line */
    char buf1[RESULTS_BUF_LEN],
         buf2[RESULTS_BUF_LEN],
         buf3[RESULTS_BUF_LEN];
    /* l1..l3: widths of the three output columns, taken from the
     * formatted reference counts */
    Int l1, l2, l3;
    /* p: scale factor handed to percentify() as its `ex` argument */
    Int p;

    if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
        VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
                     prefetch_up);
        VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
                     prefetch_down);
        VG_(message)(Vg_DebugMsg, "");
    }

    /* I cache results. Use the I_refs value to determine the first column
     * width. */
    l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
    VG_(message)(Vg_UserMsg, "I refs: %s", buf1);

    /* everything below needs the cache simulation counters */
    if (!CLG_(clo).simulate_cache) return;

    commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
    VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);

    commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
    VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);

    p = 100;

    /* avoid division by zero in the rates below */
    if (0 == total[CLG_(sets).off_full_Ir])
        total[CLG_(sets).off_full_Ir] = 1;

    percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
               total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
    VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);

    percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
               total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
    VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
    VG_(message)(Vg_UserMsg, "");

    /* D cache results.
       Use the D_refs.rd and D_refs.wr values to determine the
     * width of columns 2 & 3. */

    /* D_total = read costs + write costs (combined data access counters) */
    D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
    CLG_(init_cost)( CLG_(sets).full, D_total);
    CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
    CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );

    commify( D_total[0], l1, buf1);
    l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
    l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
    VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
                 buf1, buf2, buf3);

    commify( D_total[1], l1, buf1);
    commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
    commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
    VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
                 buf1, buf2, buf3);

    commify( D_total[2], l1, buf1);
    commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
    commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
    VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
                 buf1, buf2, buf3);

    p = 10;

    /* avoid division by zero in the rates below */
    if (0 == D_total[0]) D_total[0] = 1;
    if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
    if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;

    percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
    percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
               total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
    percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
               total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
    VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);

    percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
    percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
               total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
    percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
               total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
    VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
    VG_(message)(Vg_UserMsg, "");



    /* L2 overall results */

    /* L2 references are the first-level misses (index +1) */
    L2_total =
        total[CLG_(sets).off_full_Dr +1] +
        total[CLG_(sets).off_full_Dw +1] +
        total[CLG_(sets).off_full_Ir +1];
    L2_total_r =
        total[CLG_(sets).off_full_Dr +1] +
        total[CLG_(sets).off_full_Ir +1];
    L2_total_w = total[CLG_(sets).off_full_Dw +1];
    commify(L2_total, l1, buf1);
    commify(L2_total_r, l2, buf2);
    commify(L2_total_w, l3, buf3);
    VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
                 buf1, buf2, buf3);

    L2_total_m =
        total[CLG_(sets).off_full_Dr +2] +
        total[CLG_(sets).off_full_Dw +2] +
        total[CLG_(sets).off_full_Ir +2];
    L2_total_mr =
        total[CLG_(sets).off_full_Dr +2] +
        total[CLG_(sets).off_full_Ir +2];
    L2_total_mw = total[CLG_(sets).off_full_Dw +2];
    commify(L2_total_m, l1, buf1);
    commify(L2_total_mr, l2, buf2);
    commify(L2_total_mw, l3, buf3);
    VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
                 buf1, buf2, buf3);

    /* L2 miss rates are relative to all instruction + data references,
     * not to the L2 references; p is still 10 here */
    percentify(L2_total_m * 100 * p /
               (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
    percentify(L2_total_mr * 100 * p /
               (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
               p, l2+1, buf2);
    percentify(L2_total_mw * 100 * p /
               total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
    VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
                 buf1, buf2,buf3);
}
1877
1878
1879/*------------------------------------------------------------*/
1880/*--- Setup for Event set. ---*/
1881/*------------------------------------------------------------*/
1882
1883struct event_sets CLG_(sets);
1884
1885void CLG_(init_eventsets)(Int max_user)
1886{
1887 EventType * e1, *e2, *e3, *e4;
1888 EventSet *Ir, *Dr, *Dw;
1889 EventSet *D0, *D1r, *D1w, *D2;
1890 EventSet *sim, *full;
1891 EventSet *use;
1892 int sizeOfUseIr;
1893
1894 use = CLG_(get_eventset)("Use", 4);
1895 if (clo_collect_cacheuse) {
1896 /* if TUse is 0, there was never a load, and no loss, too */
1897 e1 = CLG_(register_eventtype)("AcCost1");
1898 CLG_(add_eventtype)(use, e1);
1899 e1 = CLG_(register_eventtype)("SpLoss1");
1900 CLG_(add_eventtype)(use, e1);
1901 e1 = CLG_(register_eventtype)("AcCost2");
1902 CLG_(add_eventtype)(use, e1);
1903 e1 = CLG_(register_eventtype)("SpLoss2");
1904 CLG_(add_eventtype)(use, e1);
1905 }
1906
1907 Ir = CLG_(get_eventset)("Ir", 4);
1908 Dr = CLG_(get_eventset)("Dr", 4);
1909 Dw = CLG_(get_eventset)("Dw", 4);
1910 if (CLG_(clo).simulate_cache) {
1911 e1 = CLG_(register_eventtype)("Ir");
1912 e2 = CLG_(register_eventtype)("I1mr");
1913 e3 = CLG_(register_eventtype)("I2mr");
1914 if (clo_simulate_writeback) {
1915 e4 = CLG_(register_eventtype)("I2dmr");
1916 CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
1917 }
1918 else
1919 CLG_(add_dep_event3)(Ir, e1,e2,e3);
1920
1921 e1 = CLG_(register_eventtype)("Dr");
1922 e2 = CLG_(register_eventtype)("D1mr");
1923 e3 = CLG_(register_eventtype)("D2mr");
1924 if (clo_simulate_writeback) {
1925 e4 = CLG_(register_eventtype)("D2dmr");
1926 CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
1927 }
1928 else
1929 CLG_(add_dep_event3)(Dr, e1,e2,e3);
1930
1931 e1 = CLG_(register_eventtype)("Dw");
1932 e2 = CLG_(register_eventtype)("D1mw");
1933 e3 = CLG_(register_eventtype)("D2mw");
1934 if (clo_simulate_writeback) {
1935 e4 = CLG_(register_eventtype)("D2dmw");
1936 CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
1937 }
1938 else
1939 CLG_(add_dep_event3)(Dw, e1,e2,e3);
1940
1941 }
1942 else {
1943 e1 = CLG_(register_eventtype)("Ir");
1944 CLG_(add_eventtype)(Ir, e1);
1945 }
1946
1947 sizeOfUseIr = use->size + Ir->size;
1948 D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
1949 CLG_(add_eventset)(D0, use);
1950 off_D0_Ir = CLG_(add_eventset)(D0, Ir);
1951
1952 D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
1953 CLG_(add_eventset)(D1r, use);
1954 off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
1955 off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
1956
1957 D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
1958 CLG_(add_eventset)(D1w, use);
1959 off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
1960 off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);
1961
1962 D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
1963 CLG_(add_eventset)(D2, use);
1964 off_D2_Ir = CLG_(add_eventset)(D2, Ir);
1965 off_D2_Dr = CLG_(add_eventset)(D2, Dr);
1966 off_D2_Dw = CLG_(add_eventset)(D2, Dw);
1967
1968 sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
1969 CLG_(add_eventset)(sim, use);
1970 CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
1971 CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
1972 CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);
1973
1974 if (CLG_(clo).collect_alloc) max_user += 2;
1975 if (CLG_(clo).collect_systime) max_user += 2;
1976
1977 full = CLG_(get_eventset)("full", sim->size + max_user);
1978 CLG_(add_eventset)(full, sim);
1979 CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
1980 CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
1981 CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;
1982
1983 CLG_(sets).use = use;
1984 CLG_(sets).Ir = Ir;
1985 CLG_(sets).Dr = Dr;
1986 CLG_(sets).Dw = Dw;
1987
1988 CLG_(sets).D0 = D0;
1989 CLG_(sets).D1r = D1r;
1990 CLG_(sets).D1w = D1w;
1991 CLG_(sets).D2 = D2;
1992
1993 CLG_(sets).sim = sim;
1994 CLG_(sets).full = full;
1995
1996 if (CLG_(clo).collect_alloc) {
1997 e1 = CLG_(register_eventtype)("allocCount");
1998 e2 = CLG_(register_eventtype)("allocSize");
1999 CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
2000 }
2001
2002 if (CLG_(clo).collect_systime) {
2003 e1 = CLG_(register_eventtype)("sysCount");
2004 e2 = CLG_(register_eventtype)("sysTime");
2005 CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
2006 }
2007
2008 CLG_DEBUGIF(1) {
2009 CLG_DEBUG(1, "EventSets:\n");
2010 CLG_(print_eventset)(-2, use);
2011 CLG_(print_eventset)(-2, Ir);
2012 CLG_(print_eventset)(-2, Dr);
2013 CLG_(print_eventset)(-2, Dw);
2014 CLG_(print_eventset)(-2, sim);
2015 CLG_(print_eventset)(-2, full);
2016 }
2017
2018 /* Not-existing events are silently ignored */
2019 CLG_(dumpmap) = CLG_(get_eventmapping)(full);
2020 CLG_(append_event)(CLG_(dumpmap), "Ir");
2021 CLG_(append_event)(CLG_(dumpmap), "Dr");
2022 CLG_(append_event)(CLG_(dumpmap), "Dw");
2023 CLG_(append_event)(CLG_(dumpmap), "I1mr");
2024 CLG_(append_event)(CLG_(dumpmap), "D1mr");
2025 CLG_(append_event)(CLG_(dumpmap), "D1mw");
2026 CLG_(append_event)(CLG_(dumpmap), "I2mr");
2027 CLG_(append_event)(CLG_(dumpmap), "D2mr");
2028 CLG_(append_event)(CLG_(dumpmap), "D2mw");
2029 CLG_(append_event)(CLG_(dumpmap), "I2dmr");
2030 CLG_(append_event)(CLG_(dumpmap), "D2dmr");
2031 CLG_(append_event)(CLG_(dumpmap), "D2dmw");
2032 CLG_(append_event)(CLG_(dumpmap), "AcCost1");
2033 CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
2034 CLG_(append_event)(CLG_(dumpmap), "AcCost2");
2035 CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
2036 CLG_(append_event)(CLG_(dumpmap), "allocCount");
2037 CLG_(append_event)(CLG_(dumpmap), "allocSize");
2038 CLG_(append_event)(CLG_(dumpmap), "sysCount");
2039 CLG_(append_event)(CLG_(dumpmap), "sysTime");
2040
2041}
2042
2043
2044
2045static
2046void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
2047{
2048 /* if eventset use is defined, it is always first (hardcoded!) */
2049 CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);
2050
2051 /* FIXME: This is hardcoded... */
2052 if (es == CLG_(sets).D0) {
2053 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2054 cost + off_D0_Ir);
2055 }
2056 else if (es == CLG_(sets).D1r) {
2057 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2058 cost + off_D1r_Ir);
2059 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2060 cost + off_D1r_Dr);
2061 }
2062 else if (es == CLG_(sets).D1w) {
2063 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2064 cost + off_D1w_Ir);
2065 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2066 cost + off_D1w_Dw);
2067 }
2068 else {
2069 CLG_ASSERT(es == CLG_(sets).D2);
2070 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2071 cost + off_D2_Ir);
2072 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2073 cost + off_D2_Dr);
2074 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2075 cost + off_D2_Dw);
2076 }
2077}
2078
2079/* this is called at dump time for every instruction executed */
2080static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
2081 InstrInfo* ii, ULong exe_count)
2082{
2083 if (!CLG_(clo).simulate_cache)
2084 cost[CLG_(sets).off_sim_Ir] += exe_count;
2085 else {
2086
2087#if 0
2088/* There is always a trivial case where exe_count and Ir can be
2089 * slightly different because ecounter is updated when executing
2090 * the next BB. E.g. for last BB executed, or when toggling collection
2091 */
2092 /* FIXME: Hardcoded that each eventset has Ir as first */
2093 if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
2094 VG_(printf)("==> Ir %llu, exe %llu\n",
2095 (bbcc->cost + ii->cost_offset)[0], exe_count);
2096 CLG_(print_bbcc_cost)(-2, bbcc);
2097 //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
2098 }
2099#endif
2100
2101 add_and_zero_Dx(ii->eventset, cost,
2102 bbcc->cost + ii->cost_offset);
2103 }
2104}
2105
2106static
2107void cachesim_after_bbsetup(void)
2108{
2109 BBCC* bbcc = CLG_(current_state).bbcc;
2110
2111 if (CLG_(clo).simulate_cache) {
2112 BB* bb = bbcc->bb;
2113
2114 /* only needed if log_* functions are called */
2115 bb_base = bb->obj->offset + bb->offset;
2116 cost_base = bbcc->cost;
2117 }
2118}
2119
2120static
2121void cachesim_finish(void)
2122{
2123 if (clo_collect_cacheuse)
2124 cacheuse_finish();
2125}
2126
2127/*------------------------------------------------------------*/
2128/*--- The simulator defined in this file ---*/
2129/*------------------------------------------------------------*/
2130
2131struct cachesim_if CLG_(cachesim) = {
2132 .print_opts = cachesim_print_opts,
2133 .parse_opt = cachesim_parse_opt,
2134 .post_clo_init = cachesim_post_clo_init,
2135 .clear = cachesim_clear,
2136 .getdesc = cachesim_getdesc,
2137 .printstat = cachesim_printstat,
2138 .add_icost = cachesim_add_icost,
2139 .after_bbsetup = cachesim_after_bbsetup,
2140 .finish = cachesim_finish,
2141
2142 /* these will be set by cachesim_post_clo_init */
2143 .log_1I0D = 0,
2144
2145 .log_1I1Dr = 0,
2146 .log_1I1Dw = 0,
2147 .log_1I2D = 0,
2148
2149 .log_0I1Dr = 0,
2150 .log_0I1Dw = 0,
2151 .log_0I2D = 0,
2152
2153 .log_1I0D_name = "(no function)",
2154
2155 .log_1I1Dr_name = "(no function)",
2156 .log_1I1Dw_name = "(no function)",
2157 .log_1I2D_name = "(no function)",
2158
2159 .log_0I1Dr_name = "(no function)",
2160 .log_0I1Dw_name = "(no function)",
2161 .log_0I2D_name = "(no function)"
2162};
2163
2164
/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/
2168