/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind.
   (c) 2003-2005, Josef Weidendorfer

   Parts are Copyright (C) 2002 Nicholas Nethercote
      njn25@cam.ac.uk


   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
   - simulates a write-allocate cache
   - (block --> set) hash function uses simple bit selection
   - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
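
/* Illustrative sketch (not part of the original source): with 64 byte
 * lines, a 4 byte access at line offset 62 touches two lines. Assuming
 * the first line hits and the second misses, the whole access counts
 * as one miss, exactly as cachesim_ref() below combines the results:
 *
 *    res1 = Hit; res2 = Miss;
 *    result = ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;   // Miss
 */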

/* Cache configuration */
#include "cg_arch.h"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
   UInt count;
   UInt mask; /* e.g. for a 64 byte line size, 1 bit per 2 bytes */
} line_use;

typedef struct {
   Addr memline, iaddr;
   line_use* dep_use; /* point to higher-level cacheblock for this memline */
   ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char* name;
   int size;        /* bytes */
   int assoc;
   int line_size;   /* bytes */
   Bool sectored;   /* prefetch nearside cacheline on read */
   int sets;
   int sets_min_1;
   int assoc_bits;
   int line_size_bits;
   int tag_shift;
   UWord tag_mask;
   char desc_line[128];
   UWord* tags;

   /* for cache use */
   int line_size_mask;
   int* line_start_mask;
   int* line_end_mask;
   line_loaded* loaded;
   line_use* use;
} cache_t2;

/*
 * States of flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1
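
/* Example (illustrative only): with MIN_LINE_SIZE = 16, the low 4 bits
 * of a stored tag entry are zero in tag_mask and thus free for flags.
 * The write-back model below marks a line dirty by OR'ing the flag
 * into the stored tag, and tests it again on eviction:
 *
 *    set[0] = tag | CACHELINE_DIRTY;                 // mark dirty
 *    Bool dirty = (set[0] & CACHELINE_DIRTY) != 0;   // test on eviction
 */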


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */

/* Offset to events in event set, used in log_* functions */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

static Addr   bb_base;
static ULong* cost_base;
static InstrInfo* current_ii;

/* Cache use offsets */
/* FIXME: The offsets are only correct because all event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_L2_AcCost = 2;
static Int off_L2_SpLoss = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
   L1_Hit,
   L2_Hit,
   MemAccess,
   WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
   simcall_type I1_Read;
   simcall_type D1_Read;
   simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
   Int i;

   for (i = 0; i < c->sets * c->assoc; i++)
      c->tags[i] = 0;
   if (c->use) {
      for (i = 0; i < c->sets * c->assoc; i++) {
         c->loaded[i].memline  = 0;
         c->loaded[i].use_base = 0;
         c->loaded[i].dep_use  = 0;
         c->loaded[i].iaddr    = 0;
         c->use[i].mask  = 0;
         c->use[i].count = 0;
         c->tags[i] = i % c->assoc; /* init lower bits as pointer */
      }
   }
}
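
/* Worked example of the "init lower bits as pointer" trick above
 * (illustrative, not from the original source): for a 4-way cache,
 * each set's tag entries start out as 0,1,2,3. The cache-use code
 * keeps these low bits intact across evictions, so that
 * (set_no << assoc_bits) | (tag & ~tag_mask) always indexes the
 * per-line use/loaded arrays, regardless of how lines get shuffled
 * within the set.
 */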

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->assoc_bits     = VG_(log2)(c->assoc);
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * This should always be true, as MIN_LINE_SIZE >= 16. */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC(sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & L2 Write Through
 * Does not distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == set[0])
      return Hit;

   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == set[i]) {
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag;

   return Miss;
}

static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
   UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag  = a >> c->tag_shift;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref(c, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      UWord tag2 = (a+size-1) >> c->tag_shift;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref(c, set1, tag);
      CacheResult res2 = cachesim_setref(c, set2, tag2);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, L2 Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, a miss can evict a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 * type (Read/Write) into the tag, the line gets dirty on a write.
 */
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set, tmp_tag;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & ~CACHELINE_DIRTY)) {
      set[0] |= ref;
      return Hit;
   }
   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & ~CACHELINE_DIRTY)) {
         tmp_tag = set[i] | ref; // update dirty flag
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1];
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | ref;

   return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
   UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag  = a & c->tag_mask;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref_wb(c, ref, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      /* the second tag must be masked like 'tag' above, so that the
       * low bits stay free for the dirty flag */
      UWord tag2 = (a+size-1) & c->tag_mask;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
      CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

      if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for an L1 hit, the write-through L1 passes
       * the write on to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
   int i;
   for(i=0;i<PF_STREAMS;i++)
      pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW Prefetch emulation
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
static __inline__
void prefetch_L2_doref(Addr a, UChar size)
{
   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
   UInt block  = ( a >> L2.line_size_bits);

   if (block != pf_lastblock[stream]) {
      if (pf_seqblocks[stream] == 0) {
         if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
         else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
      }
      else if (pf_seqblocks[stream] >0) {
         if (pf_lastblock[stream] +1 == block) {
            pf_seqblocks[stream]++;
            if (pf_seqblocks[stream] >= 2) {
               prefetch_up++;
               cachesim_ref(&L2, a + 5 * L2.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      else if (pf_seqblocks[stream] <0) {
         if (pf_lastblock[stream] -1 == block) {
            pf_seqblocks[stream]--;
            if (pf_seqblocks[stream] <= -2) {
               prefetch_down++;
               cachesim_ref(&L2, a - 5 * L2.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      pf_lastblock[stream] = block;
   }
}
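
/* Worked example (illustrative): upward accesses hitting sequential L2
 * blocks b, b+1, b+2 within one 4 KiB page drive pf_seqblocks for that
 * stream from 0 to 2, so the third access already triggers
 * cachesim_ref(&L2, a + 5 * L2.line_size, 1), i.e. a prefetch five
 * lines ahead; any non-sequential block resets the stream counter to 0.
 */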

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
   prefetch_L2_doref(a,size);
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for an L1 hit, the write-through L1 passes
       * the write on to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
   int i;
   unsigned int start_mask, start_val;
   unsigned int end_mask, end_val;

   c->use    = CLG_MALLOC(sizeof(line_use) * c->sets * c->assoc);
   c->loaded = CLG_MALLOC(sizeof(line_loaded) * c->sets * c->assoc);
   c->line_start_mask = CLG_MALLOC(sizeof(int) * c->line_size);
   c->line_end_mask   = CLG_MALLOC(sizeof(int) * c->line_size);


   c->line_size_mask = c->line_size-1;

   /* Meaning of line_start_mask/line_end_mask
    * Example: for a given cache line, you get an access starting at
    * byte offset 5 with length 4, i.e. bytes 5 - 8 were touched. For a
    * cache line size of 32, you have 1 bit per byte in the mask:
    *
    *    bit31   bit8 bit5  bit 0
    *        |      |  |    |
    *        11..111111100000   line_start_mask[5]
    *        00..000111111111   line_end_mask[(5+4)-1]
    *
    *    use_mask |= line_start_mask[5] && line_end_mask[8]
    *
    */
   start_val = end_val = ~0;
   if (c->line_size < 32) {
      int bits_per_byte = 32/c->line_size;
      start_mask = (1<<bits_per_byte)-1;
      end_mask   = start_mask << (32-bits_per_byte);
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         start_val  = start_val & ~start_mask;
         start_mask = start_mask << bits_per_byte;

         c->line_end_mask[c->line_size-i-1] = end_val;
         end_val  = end_val & ~end_mask;
         end_mask = end_mask >> bits_per_byte;
      }
   }
   else {
      int bytes_per_bit = c->line_size/32;
      start_mask = 1;
      end_mask   = 1 << 31;
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         c->line_end_mask[c->line_size-i-1] = end_val;
         if ( ((i+1)%bytes_per_bit) == 0) {
            start_val   &= ~start_mask;
            end_val     &= ~end_mask;
            start_mask <<= 1;
            end_mask   >>= 1;
         }
      }
   }

   CLG_DEBUG(6, "Config %s:\n", c->desc_line);
   for(i=0;i<c->line_size;i++) {
      CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                i, c->line_start_mask[i], c->line_end_mask[i]);
   }

   /* We use lower tag bits as offset pointers to cache use info.
    * This means that some cache parameter combinations don't work.
    */
   if (c->tag_shift < c->assoc_bits) {
      VG_(message)(Vg_DebugMsg,
                   "error: Use associativity < %d for cache use statistics!",
                   (1<<c->tag_shift) );
      VG_(tool_panic)("Unsupported cache configuration");
   }
}

/* FIXME: A little tricky. The draft below is disabled: it still relies
 * on helpers (cacheuse_update, cacheuse_L2_miss) that are not defined
 * anywhere in this file. The working implementation is the CACHEUSE
 * macro further down. */
#if 0

static __inline__
void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
{
   int idx = (high_idx << c->assoc_bits) | low_idx;

   c->use[idx].count ++;
   c->use[idx].mask |= use_mask;

   CLG_DEBUG(6," Hit [idx %d] (line %p from %p): %x => %08x, count %d\n",
             idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
             use_mask, c->use[idx].mask, c->use[idx].count);
}

/* only used for I1, D1 */

static __inline__
CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag,
                            Addr a, UChar size)
{
   int i, j;
   UWord *set, tmp_tag;
   UInt use_mask;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);
   use_mask =
      c->line_start_mask[a & c->line_size_mask] &
      c->line_end_mask[(a+size-1) & c->line_size_mask];

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & c->tag_mask)) {
      cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask);
      return Hit;
   }

   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & c->tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;

         cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1] & ~c->tag_mask;
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;

   cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
                    use_mask, a & ~c->line_size_mask);

   return Miss;
}


static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
{
   UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag  = a >> c->tag_shift;

   /* Access entirely within line. */
   if (set1 == set2)
      return cacheuse_setref(c, set1, tag, a, size);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      UWord tag2 = (a+size-1) >> c->tag_shift;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cacheuse_setref(c, set1, tag,  a, size);
      CacheResult res2 = cacheuse_setref(c, set2, tag2, a, size);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
#endif


/* for I1/D1 caches */
#define CACHEUSE(L)                                                          \
                                                                             \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)          \
{                                                                            \
   UInt  set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt  set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                              \
   UWord tag2;                                                               \
   int i, j, idx;                                                            \
   UWord *set, tmp_tag;                                                      \
   UInt use_mask;                                                            \
                                                                             \
   CLG_DEBUG(6,"%s.Acc(Addr %p, size %d): Sets [%d/%d]\n",                   \
             L.name, a, size, set1, set2);                                   \
                                                                             \
   /* First case: word entirely within line. */                              \
   if (set1 == set2) {                                                       \
                                                                             \
      /* Shifting is a bit faster than multiplying */                        \
      set = &(L.tags[set1 << L.assoc_bits]);                                 \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                   \
                 L.line_end_mask[(a+size-1) & L.line_size_mask];             \
                                                                             \
      /* This loop is unrolled for just the first case, which is the most */ \
      /* common.  We can't unroll any further because it would screw up   */ \
      /* if we have a direct-mapped (1-way) cache.                        */ \
      if (tag == (set[0] & L.tag_mask)) {                                    \
         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask);              \
         L.use[idx].count ++;                                                \
         L.use[idx].mask |= use_mask;                                        \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,          \
                   use_mask, L.use[idx].mask, L.use[idx].count);             \
         return L1_Hit;                                                      \
      }                                                                      \
      /* If the tag is one other than the MRU, move it into the MRU spot */  \
      /* and shuffle the rest down. */                                       \
      for (i = 1; i < L.assoc; i++) {                                        \
         if (tag == (set[i] & L.tag_mask)) {                                 \
            tmp_tag = set[i];                                                \
            for (j = i; j > 0; j--) {                                        \
               set[j] = set[j - 1];                                          \
            }                                                                \
            set[0] = tmp_tag;                                                \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);          \
            L.use[idx].count ++;                                             \
            L.use[idx].mask |= use_mask;                                     \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,    \
                      use_mask, L.use[idx].mask, L.use[idx].count);          \
            return L1_Hit;                                                   \
         }                                                                   \
      }                                                                      \
                                                                             \
      /* A miss; install this tag as MRU, shuffle rest down. */              \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                              \
      for (j = L.assoc - 1; j > 0; j--) {                                    \
         set[j] = set[j - 1];                                                \
      }                                                                      \
      set[0] = tag | tmp_tag;                                                \
      idx = (set1 << L.assoc_bits) | tmp_tag;                                \
      return update_##L##_use(&L, idx,                                       \
                              use_mask, a &~ L.line_size_mask);              \
                                                                             \
   /* Second case: word straddles two lines. */                              \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                 \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                           \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: L2 miss */          \
      set = &(L.tags[set1 << L.assoc_bits]);                                 \
      use_mask = L.line_start_mask[a & L.line_size_mask];                    \
      if (tag == (set[0] & L.tag_mask)) {                                    \
         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask);              \
         L.use[idx].count ++;                                                \
         L.use[idx].mask |= use_mask;                                        \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,          \
                   use_mask, L.use[idx].mask, L.use[idx].count);             \
         goto block2;                                                        \
      }                                                                      \
      for (i = 1; i < L.assoc; i++) {                                        \
         if (tag == (set[i] & L.tag_mask)) {                                 \
            tmp_tag = set[i];                                                \
            for (j = i; j > 0; j--) {                                        \
               set[j] = set[j - 1];                                          \
            }                                                                \
            set[0] = tmp_tag;                                                \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);          \
            L.use[idx].count ++;                                             \
            L.use[idx].mask |= use_mask;                                     \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,    \
                      use_mask, L.use[idx].mask, L.use[idx].count);          \
            goto block2;                                                     \
         }                                                                   \
      }                                                                      \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                              \
      for (j = L.assoc - 1; j > 0; j--) {                                    \
         set[j] = set[j - 1];                                                \
      }                                                                      \
      set[0] = tag | tmp_tag;                                                \
      idx = (set1 << L.assoc_bits) | tmp_tag;                                \
      miss1 = update_##L##_use(&L, idx,                                      \
                               use_mask, a &~ L.line_size_mask);             \
block2:                                                                      \
      set = &(L.tags[set2 << L.assoc_bits]);                                 \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];             \
      tag2 = (a+size-1) & L.tag_mask;                                        \
      if (tag2 == (set[0] & L.tag_mask)) {                                   \
         idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask);              \
         L.use[idx].count ++;                                                \
         L.use[idx].mask |= use_mask;                                        \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,          \
                   use_mask, L.use[idx].mask, L.use[idx].count);             \
         return miss1;                                                       \
      }                                                                      \
      for (i = 1; i < L.assoc; i++) {                                        \
         if (tag2 == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                                \
            for (j = i; j > 0; j--) {                                        \
               set[j] = set[j - 1];                                          \
            }                                                                \
            set[0] = tmp_tag;                                                \
            idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);          \
            L.use[idx].count ++;                                             \
            L.use[idx].mask |= use_mask;                                     \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,    \
                      use_mask, L.use[idx].mask, L.use[idx].count);          \
            return miss1;                                                    \
         }                                                                   \
      }                                                                      \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                              \
      for (j = L.assoc - 1; j > 0; j--) {                                    \
         set[j] = set[j - 1];                                                \
      }                                                                      \
      set[0] = tag2 | tmp_tag;                                               \
      idx = (set2 << L.assoc_bits) | tmp_tag;                                \
      miss2 = update_##L##_use(&L, idx,                                      \
                               use_mask, (a+size-1) &~ L.line_size_mask);    \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess : L2_Hit;    \
                                                                             \
   } else {                                                                  \
      VG_(printf)("addr: %p  size: %u  sets: %d %d", a, size, set1, set2);   \
      VG_(tool_panic)("item straddles more than two cache sets");            \
   }                                                                         \
   return 0;                                                                 \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
   unsigned int c; // store the total here
   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

   c = bits;
   c = ((c >> S[0]) & B[0]) + (c & B[0]);
   c = ((c >> S[1]) & B[1]) + (c & B[1]);
   c = ((c >> S[2]) & B[2]) + (c & B[2]);
   c = ((c >> S[3]) & B[3]) + (c & B[3]);
   c = ((c >> S[4]) & B[4]) + (c & B[4]);
   return c;
}
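
#if 0
/* Minimal self-check for countBits (illustrative sketch, not part of
 * the original build). Each step above sums adjacent bit groups of
 * doubling width, so the final value is the population count: */
static void countBits_selftest(void)
{
   CLG_ASSERT(countBits(0x00000000) ==  0);
   CLG_ASSERT(countBits(0x000000FF) ==  8);
   CLG_ASSERT(countBits(0xA5A5A5A5) == 16);  /* 4 bits set per byte */
   CLG_ASSERT(countBits(0xFFFFFFFF) == 32);
}
#endif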

static void update_L2_use(int idx, Addr memline)
{
   line_loaded* loaded = &(L2.loaded[idx]);
   line_use* use = &(L2.use[idx]);
   int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;

   CLG_DEBUG(2, " L2.miss [%d]: at %p accessing memline %p\n",
             idx, bb_base + current_ii->instr_offset, memline);
   if (use->count>0) {
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %p from %p]\n",
                use->count, i, use->mask, loaded->memline, loaded->iaddr);
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",
                CLG_(current_state).collect, loaded->use_base);

      if (CLG_(current_state).collect && loaded->use_base) {
         (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
         (loaded->use_base)[off_L2_SpLoss] += i;
      }
   }

   use->count = 0;
   use->mask  = 0;

   loaded->memline = memline;
   loaded->iaddr   = bb_base + current_ii->instr_offset;
   loaded->use_base = (CLG_(current_state).nonskipped) ?
                      CLG_(current_state).nonskipped->skipped :
                      cost_base + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
   UWord tag  = memline & L2.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"L2.Acc(Memline %p): Set %d\n", memline, setNo);

   if (tag == (set[0] & L2.tag_mask)) {
      idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): => %08x, count %d\n",
                idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                L2.use[idx].mask, L2.use[idx].count);
      return L2_Hit;
   }
   for (i = 1; i < L2.assoc; i++) {
      if (tag == (set[i] & L2.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
         l1_loaded->dep_use = &(L2.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): => %08x, count %d\n",
                   i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                   L2.use[idx].mask, L2.use[idx].count);
         return L2_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
   for (j = L2.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo << L2.assoc_bits) | tmp_tag;
   l1_loaded->dep_use = &(L2.use[idx]);

   update_L2_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L)                                                       \
                                                                            \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx,        \
                                           UInt mask, Addr memline)         \
{                                                                           \
   line_loaded* loaded = &(cache->loaded[idx]);                             \
   line_use* use = &(cache->use[idx]);                                      \
   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;             \
                                                                            \
   CLG_DEBUG(2, " %s.miss [%d]: at %p accessing memline %p (mask %08x)\n",  \
             cache->name, idx, bb_base + current_ii->instr_offset,          \
             memline, mask);                                                \
   if (use->count>0) {                                                      \
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %p from %p]\n", \
                use->count, c, use->mask, loaded->memline, loaded->iaddr);  \
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",                         \
                CLG_(current_state).collect, loaded->use_base);             \
                                                                            \
      if (CLG_(current_state).collect && loaded->use_base) {                \
         (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;         \
         (loaded->use_base)[off_##L##_SpLoss] += c;                         \
                                                                            \
         /* FIXME (?): L1/L2 line sizes must be equal ! */                  \
         loaded->dep_use->mask  |= use->mask;                               \
         loaded->dep_use->count += use->count;                              \
      }                                                                     \
   }                                                                        \
                                                                            \
   use->count = 1;                                                          \
   use->mask  = mask;                                                       \
   loaded->memline = memline;                                               \
   loaded->iaddr   = bb_base + current_ii->instr_offset;                    \
   loaded->use_base = (CLG_(current_state).nonskipped) ?                    \
                      CLG_(current_state).nonskipped->skipped :             \
                      cost_base + current_ii->cost_offset;                  \
                                                                            \
   if (memline == 0) return L2_Hit;                                         \
   return cacheuse_L2_access(memline, loaded);                              \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
   int i;
   InstrInfo ii = { 0,0,0,0,0 };

   if (!CLG_(current_state).collect) return;

   bb_base    = 0;
   current_ii = &ii;
   cost_base  = 0;

   /* update usage counters */
   if (I1.use)
      for (i = 0; i < I1.sets * I1.assoc; i++)
         if (I1.loaded[i].use_base)
            update_I1_use( &I1, i, 0,0);

   if (D1.use)
      for (i = 0; i < D1.sets * D1.assoc; i++)
         if (D1.loaded[i].use_base)
            update_D1_use( &D1, i, 0,0);

   if (L2.use)
      for (i = 0; i < L2.sets * L2.assoc; i++)
         if (L2.loaded[i].use_base)
            update_L2_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
   switch(r) {
      case WriteBackMemAccess:
         if (clo_simulate_writeback) {
            c1[3]++;
            c2[3]++;
         }
         // fall through

      case MemAccess:
         c1[2]++;
         c2[2]++;
         // fall through

      case L2_Hit:
         c1[1]++;
         c2[1]++;
         // fall through

      default:
         c1[0]++;
         c2[0]++;
   }
}
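
/* Example of the fall-through counting above (illustrative): for an
 * access classified as MemAccess, inc_costs bumps c1[2]/c2[2] (memory
 * accesses, i.e. L2 misses), then falls through to c1[1]/c2[1] (L2
 * accesses, i.e. L1 misses) and c1[0]/c2[0] (total accesses), so each
 * counter records the accesses that reached at least that level.
 */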


VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
   CacheModelResult IrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);

   CLG_DEBUG(6, "log_1I0D: Ir=%p/%u => Ir %d\n",
             bb_base + ii->instr_offset, ii->instr_size, IrRes);

   if (CLG_(current_state).collect) {
      ULong* cost_Ir;

      if (CLG_(current_state).nonskipped)
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
      else
         cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
   }
}


/* Instruction doing a read access */

VG_REGPARM(2)
static void log_1I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dr: Ir=%p/%u, Dr=%p/%u => Ir %d, Dr %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}


VG_REGPARM(2)
static void log_0I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult DrRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dr: Dr=%p/%u => Dr %d\n",
             data, ii->data_size, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}


/* Instruction doing a write access */

VG_REGPARM(2)
static void log_1I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dw: Ir=%p/%u, Dw=%p/%u => Ir %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(2)
static void log_0I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult DwRes;

   current_ii = ii;
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dw: Dw=%p/%u => Dw %d\n",
             data, ii->data_size, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
      }
      else {
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

/* Instruction doing a read and a write access */

VG_REGPARM(3)
static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult IrRes, DrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_1I2D: Ir=%p/%u, Dr=%p/%u, Dw=%p/%u => Ir %d, Dr %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(3)
static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult DrRes, DwRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_0I2D: Dr=%p/%u, Dw=%p/%u => Dr %d, Dw %d\n",
             data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}


/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;


/* Check that the cache config is sensible; abort if not. */
static
void check_cache(cache_t* cache, Char *name)
{
   /* First check they're all powers of two */
   if (-1 == VG_(log2)(cache->size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s size of %dB not a power of two; aborting.",
                   name, cache->size);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->assoc)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s associativity of %d not a power of two; aborting.",
                   name, cache->assoc);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->line_size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s line size of %dB not a power of two; aborting.",
                   name, cache->line_size);
      VG_(exit)(1);
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      VG_(message)(Vg_UserMsg,
                   "error: %s line size of %dB too small; aborting.",
                   name, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      VG_(message)(Vg_UserMsg,
                   "error: %s cache size of %dB <= line size of %dB; aborting.",
                   name, cache->size, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s associativity > (size / line size); aborting.", name);
      VG_(exit)(1);
   }
}

static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
#define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)

   Int n_clos = 0;

   // Count how many were defined on the command line.
   if (DEFINED(clo_I1_cache)) { n_clos++; }
   if (DEFINED(clo_D1_cache)) { n_clos++; }
   if (DEFINED(clo_L2_cache)) { n_clos++; }

   // Set the cache config (using auto-detection, if supported by the
   // architecture)
   VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }

   // Then check the values; abort if not acceptable.
   check_cache(I1c, "I1");
   check_cache(D1c, "D1");
   check_cache(L2c, "L2");

   if (VG_(clo_verbosity) > 1) {
      VG_(message)(Vg_UserMsg, "Cache configuration used:");
      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
                   I1c->size, I1c->assoc, I1c->line_size);
      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
                   D1c->size, D1c->assoc, D1c->line_size);
      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
                   L2c->size, L2c->assoc, L2c->line_size);
   }
#undef DEFINED
}


/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
   /* Cache configurations. */
   cache_t I1c, D1c, L2c;

   /* Initialize access handlers */
   if (!CLG_(clo).simulate_cache) {
      CLG_(cachesim).log_1I0D       = 0;
      CLG_(cachesim).log_1I0D_name  = "(no function)";

      CLG_(cachesim).log_1I1Dr      = 0;
      CLG_(cachesim).log_1I1Dw      = 0;
      CLG_(cachesim).log_1I2D       = 0;
      CLG_(cachesim).log_1I1Dr_name = "(no function)";
      CLG_(cachesim).log_1I1Dw_name = "(no function)";
      CLG_(cachesim).log_1I2D_name  = "(no function)";

      CLG_(cachesim).log_0I1Dr      = 0;
      CLG_(cachesim).log_0I1Dw      = 0;
      CLG_(cachesim).log_0I2D       = 0;
      CLG_(cachesim).log_0I1Dr_name = "(no function)";
      CLG_(cachesim).log_0I1Dw_name = "(no function)";
      CLG_(cachesim).log_0I2D_name  = "(no function)";
      return;
   }

   /* Configuration of caches is only needed with real cache simulation */
   configure_caches(&I1c, &D1c, &L2c);

   I1.name = "I1";
   D1.name = "D1";
   L2.name = "L2";

   cachesim_initcache(I1c, &I1);
   cachesim_initcache(D1c, &D1);
   cachesim_initcache(L2c, &L2);

   /* All cache simulator variants use the standard log_* helpers,
    * dispatching via the simulator struct */

   CLG_(cachesim).log_1I0D       = log_1I0D;
   CLG_(cachesim).log_1I0D_name  = "log_1I0D";

   CLG_(cachesim).log_1I1Dr      = log_1I1Dr;
   CLG_(cachesim).log_1I1Dw      = log_1I1Dw;
   CLG_(cachesim).log_1I2D       = log_1I2D;
   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
   CLG_(cachesim).log_1I2D_name  = "log_1I2D";

   CLG_(cachesim).log_0I1Dr      = log_0I1Dr;
   CLG_(cachesim).log_0I1Dw      = log_0I1Dw;
   CLG_(cachesim).log_0I2D       = log_0I2D;
   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
   CLG_(cachesim).log_0I2D_name  = "log_0I2D";

   if (clo_collect_cacheuse) {

      /* Warn about unsupported option combinations */
      if (clo_simulate_hwpref) {
         VG_(message)(Vg_DebugMsg,
                      "warning: prefetch simulation cannot be used with cache usage");
         clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
         VG_(message)(Vg_DebugMsg,
                      "warning: write-back simulation cannot be used with cache usage");
         clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
   }

   if (clo_simulate_hwpref) {
      prefetch_clear();

      if (clo_simulate_writeback) {
         simulator.I1_Read  = prefetch_I1_Read;
         simulator.D1_Read  = prefetch_D1_Read;
         simulator.D1_Write = prefetch_D1_Write;
      }
      else {
         simulator.I1_Read  = prefetch_I1_ref;
         simulator.D1_Read  = prefetch_D1_ref;
         simulator.D1_Write = prefetch_D1_ref;
      }

      return;
   }

   if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
   }
   else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
   }
}


/* Clear simulator state. Has to be initialized before. */
static
void cachesim_clear(void)
{
   cachesim_clearcache(&I1);
   cachesim_clearcache(&D1);
   cachesim_clearcache(&L2);

   prefetch_clear();
}


static void cachesim_getdesc(Char* buf)
{
   Int p;
   p  = VG_(sprintf)(buf,   "\ndesc: I1 cache: %s\n", I1.desc_line);
   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
   VG_(sprintf)(buf+p,      "desc: L2 cache: %s\n", L2.desc_line);
}

static
void cachesim_print_opts(void)
{
   VG_(printf)(
"\n   cache simulator options:\n"
"    --simulate-cache=no|yes   Do cache simulation [no]\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
   );
}

static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
{
   int   i1, i2, i3;
   int   i;
   char *opt = VG_(strdup)(orig_opt);

   i = i1 = opt_len;

   /* Option looks like "--I1=65536,2,64".
    * Find commas, replace with NULs to make three independent
    * strings, then extract numbers.  Yuck. */
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i2 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i3 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if ('\0' != opt[i]) goto bad;

   cache->size      = (Int)VG_(atoll)(opt + i1);
   cache->assoc     = (Int)VG_(atoll)(opt + i2);
   cache->line_size = (Int)VG_(atoll)(opt + i3);

   VG_(free)(opt);

   return;

  bad:
   VG_(err_bad_option)(orig_opt);
}

/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
   if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
      clo_simulate_writeback = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
      clo_simulate_writeback = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
      clo_simulate_hwpref = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
      clo_simulate_hwpref = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
      clo_simulate_sectors = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
      clo_simulate_sectors = False;

   else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
      clo_collect_cacheuse = True;
      /* Use counters only make sense with fine-grained dumping */
      CLG_(clo).dump_instr = True;
   }
   else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
      clo_collect_cacheuse = False;

   /* 5 is the length of "--I1=" */
   else if (0 == VG_(strncmp)(arg, "--I1=", 5))
      parse_opt(&clo_I1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--D1=", 5))
      parse_opt(&clo_D1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--L2=", 5))
      parse_opt(&clo_L2_cache, arg, 5);
   else
      return False;

   return True;
}

/* Adds commas to ULong, right justifying in a field field_width wide, returns
 * the string in buf. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}
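
/* Usage sketch (illustrative): commify(1234567, 12, buf) fills buf with
 * "   1,234,567" -- 7 digits plus 2 commas, right-justified in a field
 * of 12 -- and returns the unpadded length 9.
 */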

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;  /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
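
/* Usage sketch (illustrative): the callers below first scale the ratio
 * by 100*p, so one decimal digit survives the integer division; e.g.
 * with p = 10, percentify(234, 10, 8, buf) yields "   23.4%".
 */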
1738
1739static
1740void cachesim_printstat(void)
1741{
1742 FullCost total = CLG_(total_cost), D_total = 0;
1743 ULong L2_total_m, L2_total_mr, L2_total_mw,
1744 L2_total, L2_total_r, L2_total_w;
1745 char buf1[RESULTS_BUF_LEN],
1746 buf2[RESULTS_BUF_LEN],
1747 buf3[RESULTS_BUF_LEN];
1748 Int l1, l2, l3;
1749 Int p;
1750
1751 if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1752 VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
1753 prefetch_up);
1754 VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
1755 prefetch_down);
1756 VG_(message)(Vg_DebugMsg, "");
1757 }
1758
1759 /* I cache results. Use the I_refs value to determine the first column
1760 * width. */
1761 l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
1762 VG_(message)(Vg_UserMsg, "I refs: %s", buf1);
1763
1764 if (!CLG_(clo).simulate_cache) return;
1765
1766 commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
1767 VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);
1768
1769 commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
1770 VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);
1771
1772 p = 100;
1773
   if (0 == total[CLG_(sets).off_full_Ir])
      total[CLG_(sets).off_full_Ir] = 1;

   percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);

   percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
   VG_(message)(Vg_UserMsg, "");

   /* D cache results. Use the D_refs.rd and D_refs.wr values to
    * determine the width of columns 2 & 3. */

   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
   CLG_(init_cost)( CLG_(sets).full, D_total);
   CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
   CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );

   commify( D_total[0], l1, buf1);
   l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
   l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
   VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   commify( D_total[1], l1, buf1);
   commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
   commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
   VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   commify( D_total[2], l1, buf1);
   commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
   commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
   VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   p = 10;

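   /* Avoid division by zero below: a zero access count implies a zero
    * miss count, so the corresponding rates are 0 anyway. */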
   if (0 == D_total[0]) D_total[0] = 1;
   if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
   if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;

   percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
   percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
   percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2, buf3);

   percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
   percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
   percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2, buf3);
   VG_(message)(Vg_UserMsg, "");

   /* L2 overall results */

   L2_total =
      total[CLG_(sets).off_full_Dr +1] +
      total[CLG_(sets).off_full_Dw +1] +
      total[CLG_(sets).off_full_Ir +1];
   L2_total_r =
      total[CLG_(sets).off_full_Dr +1] +
      total[CLG_(sets).off_full_Ir +1];
   L2_total_w = total[CLG_(sets).off_full_Dw +1];
   commify(L2_total, l1, buf1);
   commify(L2_total_r, l2, buf2);
   commify(L2_total_w, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   L2_total_m =
      total[CLG_(sets).off_full_Dr +2] +
      total[CLG_(sets).off_full_Dw +2] +
      total[CLG_(sets).off_full_Ir +2];
   L2_total_mr =
      total[CLG_(sets).off_full_Dr +2] +
      total[CLG_(sets).off_full_Ir +2];
   L2_total_mw = total[CLG_(sets).off_full_Dw +2];
   commify(L2_total_m, l1, buf1);
   commify(L2_total_mr, l2, buf2);
   commify(L2_total_mw, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   percentify(L2_total_m * 100 * p /
              (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
   percentify(L2_total_mr * 100 * p /
              (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
              p, l2+1, buf2);
   percentify(L2_total_mw * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
                buf1, buf2, buf3);
}
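
/* Worked example (invented numbers): with 1,000,000 I refs and 2,000 I1
 * misses, the I1 miss rate above is 2000 * 100 * 100 / 1000000 = 20 in
 * fixed point, and percentify(20, 100, l1+1, buf1) prints "0.20%". */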


/*------------------------------------------------------------*/
/*--- Setup for Event set. ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)(Int max_user)
{
   EventType *e1, *e2, *e3, *e4;
   EventSet *Ir, *Dr, *Dw;
   EventSet *D0, *D1r, *D1w, *D2;
   EventSet *sim, *full;
   EventSet *use;
   int sizeOfUseIr;

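   /* Costs are grouped into event sets. "Use" holds the optional
    * cache-use events (access cost and spatial loss, per cache level);
    * Ir/Dr/Dw hold the events of one access type: access count, L1 miss
    * count, L2 miss count and, with write-back simulation, the dirty
    * miss count. */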
   use = CLG_(get_eventset)("Use", 4);
   if (clo_collect_cacheuse) {
      /* if the total use count (TUse) is 0, there never was a load,
       * and thus no loss either */
      e1 = CLG_(register_eventtype)("AcCost1");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("SpLoss1");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("AcCost2");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("SpLoss2");
      CLG_(add_eventtype)(use, e1);
   }

   Ir = CLG_(get_eventset)("Ir", 4);
   Dr = CLG_(get_eventset)("Dr", 4);
   Dw = CLG_(get_eventset)("Dw", 4);
   if (CLG_(clo).simulate_cache) {
      e1 = CLG_(register_eventtype)("Ir");
      e2 = CLG_(register_eventtype)("I1mr");
      e3 = CLG_(register_eventtype)("I2mr");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("I2dmr");
         CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Ir, e1,e2,e3);

      e1 = CLG_(register_eventtype)("Dr");
      e2 = CLG_(register_eventtype)("D1mr");
      e3 = CLG_(register_eventtype)("D2mr");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("D2dmr");
         CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Dr, e1,e2,e3);

      e1 = CLG_(register_eventtype)("Dw");
      e2 = CLG_(register_eventtype)("D1mw");
      e3 = CLG_(register_eventtype)("D2mw");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("D2dmw");
         CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Dw, e1,e2,e3);
   }
   else {
      e1 = CLG_(register_eventtype)("Ir");
      CLG_(add_eventtype)(Ir, e1);
   }

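   /* Combined event sets for the four instruction kinds: D0 for
    * instructions without data access (Use+Ir), D1r for one data read
    * (Use+Ir+Dr), D1w for one data write (Use+Ir+Dw), and D2 for one
    * read plus one write (Use+Ir+Dr+Dw). */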
   sizeOfUseIr = use->size + Ir->size;
   D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
   CLG_(add_eventset)(D0, use);
   off_D0_Ir = CLG_(add_eventset)(D0, Ir);

   D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
   CLG_(add_eventset)(D1r, use);
   off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
   off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);

   D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
   CLG_(add_eventset)(D1w, use);
   off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
   off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);

   D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
   CLG_(add_eventset)(D2, use);
   off_D2_Ir = CLG_(add_eventset)(D2, Ir);
   off_D2_Dr = CLG_(add_eventset)(D2, Dr);
   off_D2_Dw = CLG_(add_eventset)(D2, Dw);

   sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
   CLG_(add_eventset)(sim, use);
   CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
   CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
   CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);

   if (CLG_(clo).collect_alloc) max_user += 2;
   if (CLG_(clo).collect_systime) max_user += 2;

   full = CLG_(get_eventset)("full", sim->size + max_user);
   CLG_(add_eventset)(full, sim);
   CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
   CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
   CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;

   CLG_(sets).use = use;
   CLG_(sets).Ir = Ir;
   CLG_(sets).Dr = Dr;
   CLG_(sets).Dw = Dw;

   CLG_(sets).D0 = D0;
   CLG_(sets).D1r = D1r;
   CLG_(sets).D1w = D1w;
   CLG_(sets).D2 = D2;

   CLG_(sets).sim = sim;
   CLG_(sets).full = full;

   if (CLG_(clo).collect_alloc) {
      e1 = CLG_(register_eventtype)("allocCount");
      e2 = CLG_(register_eventtype)("allocSize");
      CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
   }

   if (CLG_(clo).collect_systime) {
      e1 = CLG_(register_eventtype)("sysCount");
      e2 = CLG_(register_eventtype)("sysTime");
      CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
   }

   CLG_DEBUGIF(1) {
      CLG_DEBUG(1, "EventSets:\n");
      CLG_(print_eventset)(-2, use);
      CLG_(print_eventset)(-2, Ir);
      CLG_(print_eventset)(-2, Dr);
      CLG_(print_eventset)(-2, Dw);
      CLG_(print_eventset)(-2, sim);
      CLG_(print_eventset)(-2, full);
   }

   /* Event names that do not exist in the event set are silently ignored */
   CLG_(dumpmap) = CLG_(get_eventmapping)(full);
   CLG_(append_event)(CLG_(dumpmap), "Ir");
   CLG_(append_event)(CLG_(dumpmap), "Dr");
   CLG_(append_event)(CLG_(dumpmap), "Dw");
   CLG_(append_event)(CLG_(dumpmap), "I1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mw");
   CLG_(append_event)(CLG_(dumpmap), "I2mr");
   CLG_(append_event)(CLG_(dumpmap), "D2mr");
   CLG_(append_event)(CLG_(dumpmap), "D2mw");
   CLG_(append_event)(CLG_(dumpmap), "I2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmw");
   CLG_(append_event)(CLG_(dumpmap), "AcCost1");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
   CLG_(append_event)(CLG_(dumpmap), "AcCost2");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
   CLG_(append_event)(CLG_(dumpmap), "allocCount");
   CLG_(append_event)(CLG_(dumpmap), "allocSize");
   CLG_(append_event)(CLG_(dumpmap), "sysCount");
   CLG_(append_event)(CLG_(dumpmap), "sysTime");
}


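/* Add the cost entries of event set 'es' (one of D0/D1r/D1w/D2) into
 * the 'sim'-layout cost array 'dst', zeroing the source entries. */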
static
void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
{
   /* if the 'use' eventset is defined, it always comes first (hardcoded!) */
   CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);

   /* FIXME: This is hardcoded... */
   if (es == CLG_(sets).D0) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D0_Ir);
   }
   else if (es == CLG_(sets).D1r) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D1r_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
                               cost + off_D1r_Dr);
   }
   else if (es == CLG_(sets).D1w) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D1w_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
                               cost + off_D1w_Dw);
   }
   else {
      CLG_ASSERT(es == CLG_(sets).D2);
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D2_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
                               cost + off_D2_Dr);
      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
                               cost + off_D2_Dw);
   }
}

/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
   if (!CLG_(clo).simulate_cache)
      cost[CLG_(sets).off_sim_Ir] += exe_count;
   else {

#if 0
      /* There always is a trivial case where exe_count and the Ir event
       * count can differ slightly, as ecounter is only updated when the
       * next BB starts executing: e.g. for the last BB executed, or when
       * collection state is toggled. */
      /* FIXME: Hardcoded that each eventset has Ir as first */
      if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
         VG_(printf)("==> Ir %llu, exe %llu\n",
                     (bbcc->cost + ii->cost_offset)[0], exe_count);
         CLG_(print_bbcc_cost)(-2, bbcc);
         //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
      }
#endif

      add_and_zero_Dx(ii->eventset, cost,
                      bbcc->cost + ii->cost_offset);
   }
}

static
void cachesim_after_bbsetup(void)
{
   BBCC* bbcc = CLG_(current_state).bbcc;

   if (CLG_(clo).simulate_cache) {
      BB* bb = bbcc->bb;

      /* only needed if log_* functions are called */
      bb_base = bb->obj->offset + bb->offset;
      cost_base = bbcc->cost;
   }
}

static
void cachesim_finish(void)
{
   if (clo_collect_cacheuse)
      cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
   .print_opts = cachesim_print_opts,
   .parse_opt = cachesim_parse_opt,
   .post_clo_init = cachesim_post_clo_init,
   .clear = cachesim_clear,
   .getdesc = cachesim_getdesc,
   .printstat = cachesim_printstat,
   .add_icost = cachesim_add_icost,
   .after_bbsetup = cachesim_after_bbsetup,
   .finish = cachesim_finish,

   /* these will be set by cachesim_post_clo_init */
   .log_1I0D = 0,

   .log_1I1Dr = 0,
   .log_1I1Dw = 0,
   .log_1I2D = 0,

   .log_0I1Dr = 0,
   .log_0I1Dw = 0,
   .log_0I2D = 0,

   .log_1I0D_name = "(no function)",

   .log_1I1Dr_name = "(no function)",
   .log_1I1Dw_name = "(no function)",
   .log_1I2D_name = "(no function)",

   .log_0I1Dr_name = "(no function)",
   .log_0I1Dw_name = "(no function)",
   .log_0I2D_name = "(no function)"
};


/*--------------------------------------------------------------------*/
/*--- end sim.c ---*/
/*--------------------------------------------------------------------*/
