/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind.
   (c) 2003-2005, Josef Weidendorfer

   Parts are Copyright (C) 2002 Nicholas Nethercote
      njn25@cam.ac.uk


   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
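
/* Worked example of the straddling rule above (illustrative only):
 * with 64 B lines, a 4-byte access at address 0x103e touches bytes
 * 0x103e..0x1041 and thus straddles the lines starting at 0x1000 and
 * 0x1040. The simulator looks up both lines, but records at most one
 * hit or one miss for the whole access.
 */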

/* Cache configuration */
#include "cg_arch.h"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
   UInt count;
   UInt mask; /* e.g. for 64 byte line size, 1 bit per 2 bytes */
} line_use;

typedef struct {
   Addr memline, iaddr;
   line_use* dep_use; /* points to the higher-level cache block for this memline */
   ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char* name;
   int size;        /* bytes */
   int assoc;
   int line_size;   /* bytes */
   Bool sectored;   /* prefetch nearside cache line on read */
   int sets;
   int sets_min_1;
   int assoc_bits;
   int line_size_bits;
   int tag_shift;
   UWord tag_mask;
   char desc_line[128];
   UWord* tags;

   /* for cache use */
   int line_size_mask;
   int* line_start_mask;
   int* line_end_mask;
   line_loaded* loaded;
   line_use* use;
} cache_t2;

/*
 * States of the flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global variables are set up beforehand by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr bb_base     (instruction start address of original BB)
 * - ULong* cost_base (start of cost array for BB)
 * - BBCC* nonskipped (only != 0 when in a function not skipped)
 */

/* Offsets to events in the event set, used in the log_* functions */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

static Addr bb_base;
static ULong* cost_base;
static InstrInfo* current_ii;

/* Cache use offsets */
/* FIXME: The offsets are only correct because all event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_L2_AcCost = 2;
static Int off_L2_SpLoss = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
   L1_Hit,
   L2_Hit,
   MemAccess,
   WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
   simcall_type I1_Read;
   simcall_type D1_Read;
   simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
   Int i;

   for (i = 0; i < c->sets * c->assoc; i++)
      c->tags[i] = 0;
   if (c->use) {
      for (i = 0; i < c->sets * c->assoc; i++) {
         c->loaded[i].memline  = 0;
         c->loaded[i].use_base = 0;
         c->loaded[i].dep_use  = 0;
         c->loaded[i].iaddr    = 0;
         c->use[i].mask  = 0;
         c->use[i].count = 0;
         c->tags[i] = i % c->assoc; /* init lower bits as pointer */
      }
   }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, size/assoc/line_size have been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->assoc_bits     = VG_(log2)(c->assoc);
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * This should always be true, as MIN_LINE_SIZE >= 16. */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC(sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}
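
/* Worked example for the geometry computed above (illustrative only):
 * a 65536 B, 2-way cache with 64 B lines gives
 *   sets           = (65536/64)/2  = 512
 *   assoc_bits     = log2(2)       = 1
 *   line_size_bits = log2(64)      = 6
 *   tag_shift      = 6 + log2(512) = 15
 * so address bits [5:0] select the byte within the line, bits [14:6]
 * select the set (simple bit selection), and the bits above form the tag.
 */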


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 and L2 are both write-through.
 * Does not distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == set[0])
      return Hit;

   /* If the tag is in a slot other than the MRU one, move it into the */
   /* MRU spot and shuffle the rest down.                              */
   for (i = 1; i < c->assoc; i++) {
      if (tag == set[i]) {
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag;

   return Miss;
}
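
/* Illustrative trace of the LRU policy implemented above, assuming a
 * 4-way set with the MRU entry at index 0 and letters standing for tags:
 *   set = [A,B,C,D], ref C --> hit,  set becomes [C,A,B,D]
 *   set = [C,A,B,D], ref E --> miss, D is evicted, set becomes [E,C,A,B]
 */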

static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
   UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag = a >> c->tag_shift;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref(c, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {

      /* the calls update cache structures as a side effect
       * (note: the first line's tag is used for both lookups, a slight
       *  approximation when the straddle wraps around to set 0) */
      CacheResult res1 = cachesim_setref(c, set1, tag);
      CacheResult res2 = cachesim_setref(c, set2, tag);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 write-through, L2 write-back.
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, the result of a reference can be a miss that evicts
 * a dirty line. The dirty state of a cache line is stored in bit 0 of
 * the tag for this cache line (CACHELINE_DIRTY = 1). By OR'ing the
 * reference type (Read/Write) into the tag, the line gets dirty on a write.
 */
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set, tmp_tag;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & ~CACHELINE_DIRTY)) {
      set[0] |= ref;
      return Hit;
   }
   /* If the tag is in a slot other than the MRU one, move it into the */
   /* MRU spot and shuffle the rest down.                              */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & ~CACHELINE_DIRTY)) {
         tmp_tag = set[i] | ref; // update dirty flag
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1];
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | ref;

   return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}
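
/* Example of the dirty-bit encoding used above: as MIN_LINE_SIZE >= 16,
 * bit 0 of a tag entry is free, so a line referenced by a write is stored
 * as (tag | CACHELINE_DIRTY). When a later miss evicts such an entry, the
 * function returns MissDirty, which the callers below translate into
 * WriteBackMemAccess.
 */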


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
   UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag = a & c->tag_mask;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref_wb(c, ref, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {

      /* the calls update cache structures as a side effect */
      CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
      CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag);

      if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even on an L1 hit, the write-through L1 passes
       * the write on to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
   int i;
   for(i=0; i<PF_STREAMS; i++)
      pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW prefetch emulation:
 * Start prefetching when sequential access to 3 memory blocks is detected.
 * One stream can be detected per 4k page.
 */
static __inline__
void prefetch_L2_doref(Addr a, UChar size)
{
   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
   UInt block  = ( a >> L2.line_size_bits);

   if (block != pf_lastblock[stream]) {
      if (pf_seqblocks[stream] == 0) {
         if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
         else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
      }
      else if (pf_seqblocks[stream] >0) {
         if (pf_lastblock[stream] +1 == block) {
            pf_seqblocks[stream]++;
            if (pf_seqblocks[stream] >= 2) {
               prefetch_up++;
               cachesim_ref(&L2, a + 5 * L2.line_size, 1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      else if (pf_seqblocks[stream] <0) {
         if (pf_lastblock[stream] -1 == block) {
            pf_seqblocks[stream]--;
            if (pf_seqblocks[stream] <= -2) {
               prefetch_down++;
               cachesim_ref(&L2, a - 5 * L2.line_size, 1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      pf_lastblock[stream] = block;
   }
}
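
/* Illustrative trace of the stream detection above (one stream per 4 KB
 * page, blocks are L2-line sized): accesses to blocks n, n+1, n+2 move
 * pf_seqblocks from 0 to 1 to 2, so the third sequential block already
 * triggers a prefetch of the line 5*line_size ahead; any non-sequential
 * block resets the stream to 0. Downward streams work symmetrically.
 */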

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a, size);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a, size);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a, size);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a, size);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
   prefetch_L2_doref(a, size);
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even on an L1 hit, the write-through L1 passes
       * the write on to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
   int i;
   unsigned int start_mask, start_val;
   unsigned int end_mask, end_val;

   c->use    = CLG_MALLOC(sizeof(line_use) * c->sets * c->assoc);
   c->loaded = CLG_MALLOC(sizeof(line_loaded) * c->sets * c->assoc);
   c->line_start_mask = CLG_MALLOC(sizeof(int) * c->line_size);
   c->line_end_mask   = CLG_MALLOC(sizeof(int) * c->line_size);

   c->line_size_mask = c->line_size-1;

   /* Meaning of line_start_mask/line_end_mask:
    * Example: a given cache line gets an access starting at byte
    * offset 5 with length 4, i.e. bytes 5..8 are touched. For a cache
    * line size of 32, you have 1 bit per byte in the mask:
    *
    *   bit31   bit8 bit5  bit 0
    *       |      |    |  |
    *       11..111111100000   line_start_mask[5]
    *       00..000111111111   line_end_mask[(5+4)-1]
    *
    *  use_mask |= line_start_mask[5] & line_end_mask[8]
    *
    */
   start_val = end_val = ~0;
   if (c->line_size < 32) {
      int bits_per_byte = 32/c->line_size;
      start_mask = (1<<bits_per_byte)-1;
      end_mask   = start_mask << (32-bits_per_byte);
      for(i=0; i<c->line_size; i++) {
         c->line_start_mask[i] = start_val;
         start_val  = start_val & ~start_mask;
         start_mask = start_mask << bits_per_byte;

         c->line_end_mask[c->line_size-i-1] = end_val;
         end_val  = end_val & ~end_mask;
         end_mask = end_mask >> bits_per_byte;
      }
   }
   else {
      int bytes_per_bit = c->line_size/32;
      start_mask = 1;
      end_mask   = 1 << 31;
      for(i=0; i<c->line_size; i++) {
         c->line_start_mask[i] = start_val;
         c->line_end_mask[c->line_size-i-1] = end_val;
         if ( ((i+1)%bytes_per_bit) == 0) {
            start_val &= ~start_mask;
            end_val   &= ~end_mask;
            start_mask <<= 1;
            end_mask   >>= 1;
         }
      }
   }

   CLG_DEBUG(6, "Config %s:\n", c->desc_line);
   for(i=0; i<c->line_size; i++) {
      CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                i, c->line_start_mask[i], c->line_end_mask[i]);
   }

   /* We use lower tag bits as offset pointers to cache use info.
    * I.e. some cache parameters don't work.
    */
   if (c->tag_shift < c->assoc_bits) {
      VG_(message)(Vg_DebugMsg,
                   "error: Use associativity < %d for cache use statistics!",
                   (1<<c->tag_shift) );
      VG_(tool_panic)("Unsupported cache configuration");
   }
}
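
/* Worked example for the mask tables built above, with line_size = 64
 * (the bytes_per_bit = 2 branch, i.e. one mask bit covers two bytes):
 *   line_start_mask[0]  = 0xFFFFFFFF    line_end_mask[63] = 0xFFFFFFFF
 *   line_start_mask[2]  = 0xFFFFFFFE    line_end_mask[61] = 0x7FFFFFFF
 *   line_start_mask[62] = 0x80000000    line_end_mask[1]  = 0x00000001
 * An access to bytes 4..7 of a line then yields
 *   use_mask = line_start_mask[4] & line_end_mask[7] = 0x0000000C,
 * i.e. two bits set, covering 4 bytes.
 */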

/* FIXME: A little tricky */
#if 0

static __inline__
void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
{
   int idx = (high_idx << c->assoc_bits) | low_idx;

   c->use[idx].count ++;
   c->use[idx].mask |= use_mask;

   CLG_DEBUG(6," Hit [idx %d] (line %p from %p): %x => %08x, count %d\n",
             idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
             use_mask, c->use[idx].mask, c->use[idx].count);
}

/* only used for I1, D1 */

static __inline__
CacheResult cacheuse_setref(cache_t2* c, Addr a, UChar size,
                            UInt set_no, UWord tag)
{
   int i, j;
   UWord *set, tmp_tag;
   UInt use_mask;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);
   use_mask =
      c->line_start_mask[a & c->line_size_mask] &
      c->line_end_mask[(a+size-1) & c->line_size_mask];

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & c->tag_mask)) {
      cacheuse_update_hit(c, set_no, set[0] & ~c->tag_mask, use_mask);
      return Hit;
   }

   /* If the tag is in a slot other than the MRU one, move it into the */
   /* MRU spot and shuffle the rest down.                              */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & c->tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;

         cacheuse_update_hit(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1] & ~c->tag_mask;
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;

   cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
                    use_mask, a & ~c->line_size_mask);

   return Miss;
}


static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
{
   UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag = a >> c->tag_shift;

   /* Access entirely within line. */
   if (set1 == set2)
      return cacheuse_setref(c, a, size, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {

      /* the calls update cache structures as a side effect */
      CacheResult res1 = cacheuse_setref(c, a, size, set1, tag);
      CacheResult res2 = cacheuse_setref(c, a, size, set2, tag);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
#endif


/* for I1/D1 caches */
#define CACHEUSE(L) \
 \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
{ \
   register UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1); \
   register UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
   register UWord tag = a & L.tag_mask; \
   int i, j, idx; \
   UWord *set, tmp_tag; \
   UInt use_mask; \
 \
   CLG_DEBUG(6,"%s.Acc(Addr %p, size %d): Sets [%d/%d]\n", \
             L.name, a, size, set1, set2); \
 \
   /* First case: word entirely within line. */ \
   if (set1 == set2) { \
 \
      /* Shifting is a bit faster than multiplying */ \
      set = &(L.tags[set1 << L.assoc_bits]); \
      use_mask = L.line_start_mask[a & L.line_size_mask] & \
                 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
 \
      /* This loop is unrolled for just the first case, which is the most */ \
      /* common.  We can't unroll any further because it would screw up   */ \
      /* if we have a direct-mapped (1-way) cache.                        */ \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         return L1_Hit; \
      } \
      /* If the tag is in a slot other than the MRU one, move it into */ \
      /* the MRU spot and shuffle the rest down.                      */ \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return L1_Hit; \
         } \
      } \
 \
      /* A miss; install this tag as MRU, shuffle rest down. */ \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 << L.assoc_bits) | tmp_tag; \
      return update_##L##_use(&L, idx, \
                              use_mask, a & ~L.line_size_mask); \
 \
   /* Second case: word straddles two lines. */ \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
   } else if (((set1 + 1) & (L.sets-1)) == set2) { \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss & L2 hit, 2: L2 miss */ \
      set = &(L.tags[set1 << L.assoc_bits]); \
      use_mask = L.line_start_mask[a & L.line_size_mask]; \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         goto block2; \
      } \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            goto block2; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 << L.assoc_bits) | tmp_tag; \
      miss1 = update_##L##_use(&L, idx, \
                               use_mask, a & ~L.line_size_mask); \
block2: \
      set = &(L.tags[set2 << L.assoc_bits]); \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         return miss1; \
      } \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return miss1; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set2 << L.assoc_bits) | tmp_tag; \
      miss2 = update_##L##_use(&L, idx, \
                               use_mask, (a+size-1) & ~L.line_size_mask); \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess : L2_Hit; \
 \
   } else { \
      VG_(printf)("addr: %p  size: %u  sets: %d %d", a, size, set1, set2); \
      VG_(tool_panic)("item straddles more than two cache sets"); \
   } \
   return 0; \
}


/* Logarithmic bit counting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
   unsigned int c; // store the total here
   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

   c = bits;
   c = ((c >> S[0]) & B[0]) + (c & B[0]);
   c = ((c >> S[1]) & B[1]) + (c & B[1]);
   c = ((c >> S[2]) & B[2]) + (c & B[2]);
   c = ((c >> S[3]) & B[3]) + (c & B[3]);
   c = ((c >> S[4]) & B[4]) + (c & B[4]);
   return c;
}
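
/* A hedged self-check sketch for countBits (not compiled in, like
 * print_cache above); the expected values follow directly from the
 * definition of a population count:
 */
#if 0
static void countBits_selfcheck(void)
{
   CLG_ASSERT(countBits(0x00000000) ==  0);
   CLG_ASSERT(countBits(0xF0F0F0F0) == 16);
   CLG_ASSERT(countBits(0xFFFFFFFF) == 32);
}
#endif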

static void update_L2_use(int idx, Addr memline)
{
   line_loaded* loaded = &(L2.loaded[idx]);
   line_use* use = &(L2.use[idx]);
   int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;

   CLG_DEBUG(2, " L2.miss [%d]: at %p accessing memline %p\n",
             idx, bb_base + current_ii->instr_offset, memline);
   if (use->count>0) {
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %p from %p]\n",
                use->count, i, use->mask, loaded->memline, loaded->iaddr);
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",
                CLG_(current_state).collect, loaded->use_base);

      if (CLG_(current_state).collect && loaded->use_base) {
         (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
         (loaded->use_base)[off_L2_SpLoss] += i;
      }
   }

   use->count = 0;
   use->mask  = 0;

   loaded->memline = memline;
   loaded->iaddr   = bb_base + current_ii->instr_offset;
   loaded->use_base = (CLG_(current_state).nonskipped) ?
      CLG_(current_state).nonskipped->skipped :
      cost_base + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
   UWord tag  = memline & L2.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"L2.Acc(Memline %p): Set %d\n", memline, setNo);

   if (tag == (set[0] & L2.tag_mask)) {
      idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): => %08x, count %d\n",
                idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                L2.use[idx].mask, L2.use[idx].count);
      return L2_Hit;
   }
   for (i = 1; i < L2.assoc; i++) {
      if (tag == (set[i] & L2.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
         l1_loaded->dep_use = &(L2.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): => %08x, count %d\n",
                   i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                   L2.use[idx].mask, L2.use[idx].count);
         return L2_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
   for (j = L2.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo << L2.assoc_bits) | tmp_tag;
   l1_loaded->dep_use = &(L2.use[idx]);

   update_L2_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L) \
 \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
                                           UInt mask, Addr memline) \
{ \
   line_loaded* loaded = &(cache->loaded[idx]); \
   line_use* use = &(cache->use[idx]); \
   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
 \
   CLG_DEBUG(2, " %s.miss [%d]: at %p accessing memline %p (mask %08x)\n", \
             cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
   if (use->count>0) { \
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %p from %p]\n", \
                use->count, c, use->mask, loaded->memline, loaded->iaddr); \
      CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
                CLG_(current_state).collect, loaded->use_base); \
 \
      if (CLG_(current_state).collect && loaded->use_base) { \
         (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
         (loaded->use_base)[off_##L##_SpLoss] += c; \
 \
         /* FIXME (?): L1/L2 line sizes must be equal! */ \
         loaded->dep_use->mask  |= use->mask; \
         loaded->dep_use->count += use->count; \
      } \
   } \
 \
   use->count = 1; \
   use->mask  = mask; \
   loaded->memline = memline; \
   loaded->iaddr   = bb_base + current_ii->instr_offset; \
   loaded->use_base = (CLG_(current_state).nonskipped) ? \
      CLG_(current_state).nonskipped->skipped : \
      cost_base + current_ii->cost_offset; \
 \
   if (memline == 0) return L2_Hit; \
   return cacheuse_L2_access(memline, loaded); \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
   int i;
   InstrInfo ii = { 0,0,0,0,0 };

   if (!CLG_(current_state).collect) return;

   bb_base    = 0;
   current_ii = &ii;
   cost_base  = 0;

   /* update usage counters */
   if (I1.use)
      for (i = 0; i < I1.sets * I1.assoc; i++)
         if (I1.loaded[i].use_base)
            update_I1_use( &I1, i, 0,0);

   if (D1.use)
      for (i = 0; i < D1.sets * D1.assoc; i++)
         if (D1.loaded[i].use_base)
            update_D1_use( &D1, i, 0,0);

   if (L2.use)
      for (i = 0; i < L2.sets * L2.assoc; i++)
         if (L2.loaded[i].use_base)
            update_L2_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
   switch(r) {
      case WriteBackMemAccess:
         if (clo_simulate_writeback) {
            c1[3]++;
            c2[3]++;
         }
         // fall through

      case MemAccess:
         c1[2]++;
         c2[2]++;
         // fall through

      case L2_Hit:
         c1[1]++;
         c2[1]++;
         // fall through

      default:
         c1[0]++;
         c2[0]++;
   }
}
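
/* Example of the fall-through accounting above: for r == L2_Hit, both
 * the L1 miss counters (c1[1]/c2[1]) and the access counters
 * (c1[0]/c2[0]) are incremented; a plain L1 hit only increments the
 * access counters, and a MemAccess additionally bumps c1[2]/c2[2].
 */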


VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
   CacheModelResult IrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);

   CLG_DEBUG(6, "log_1I0D:  Ir=%p/%u => Ir %d\n",
             bb_base + ii->instr_offset, ii->instr_size, IrRes);

   if (CLG_(current_state).collect) {
      ULong* cost_Ir;

      if (CLG_(current_state).nonskipped)
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
      else
         cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
   }
}


/* Instruction doing a read access */

VG_REGPARM(2)
static void log_1I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dr: Ir=%p/%u, Dr=%p/%u => Ir %d, Dr %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}


VG_REGPARM(2)
static void log_0I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult DrRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dr: Dr=%p/%u => Dr %d\n",
             data, ii->data_size, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}


/* Instruction doing a write access */

VG_REGPARM(2)
static void log_1I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dw: Ir=%p/%u, Dw=%p/%u => Ir %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(2)
static void log_0I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult DwRes;

   current_ii = ii;
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dw: Dw=%p/%u => Dw %d\n",
             data, ii->data_size, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
      }
      else {
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

/* Instruction doing a read and a write access */

VG_REGPARM(3)
static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult IrRes, DrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_1I2D: Ir=%p/%u, Dr=%p/%u, Dw=%p/%u => Ir %d, Dr %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(3)
static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult DrRes, DwRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_0I2D: Dr=%p/%u, Dw=%p/%u => Dr %d, Dw %d\n",
             data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}


/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;


/* Checks that the cache config is acceptable; aborts if not. */
static
void check_cache(cache_t* cache, Char *name)
{
   /* First check they're all powers of two */
   if (-1 == VG_(log2)(cache->size)) {
      VG_(message)(Vg_UserMsg,
         "error: %s size of %dB not a power of two; aborting.",
         name, cache->size);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->assoc)) {
      VG_(message)(Vg_UserMsg,
         "error: %s associativity of %d not a power of two; aborting.",
         name, cache->assoc);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->line_size)) {
      VG_(message)(Vg_UserMsg,
         "error: %s line size of %dB not a power of two; aborting.",
         name, cache->line_size);
      VG_(exit)(1);
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      VG_(message)(Vg_UserMsg,
         "error: %s line size of %dB too small; aborting.",
         name, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      VG_(message)(Vg_UserMsg,
         "error: %s cache size of %dB <= line size of %dB; aborting.",
         name, cache->size, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      VG_(message)(Vg_UserMsg,
         "error: %s associativity > (size / line size); aborting.", name);
      VG_(exit)(1);
   }
}

static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
#define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)

   Int n_clos = 0;

   // Count how many were defined on the command line.
   if (DEFINED(clo_I1_cache)) { n_clos++; }
   if (DEFINED(clo_D1_cache)) { n_clos++; }
   if (DEFINED(clo_L2_cache)) { n_clos++; }

   // Set the cache config (using auto-detection, if supported by the
   // architecture).
   VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }

   // Then check the values; this aborts if they are not acceptable.
   check_cache(I1c, "I1");
   check_cache(D1c, "D1");
   check_cache(L2c, "L2");

   if (VG_(clo_verbosity) > 1) {
      VG_(message)(Vg_UserMsg, "Cache configuration used:");
      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
                   I1c->size, I1c->assoc, I1c->line_size);
      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
                   D1c->size, D1c->assoc, D1c->line_size);
      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
                   L2c->size, L2c->assoc, L2c->line_size);
   }
#undef DEFINED
}


/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
   /* Cache configurations. */
   cache_t I1c, D1c, L2c;

   /* Initialize access handlers */
   if (!CLG_(clo).simulate_cache) {
      CLG_(cachesim).log_1I0D      = 0;
      CLG_(cachesim).log_1I0D_name = "(no function)";

      CLG_(cachesim).log_1I1Dr = 0;
      CLG_(cachesim).log_1I1Dw = 0;
      CLG_(cachesim).log_1I2D  = 0;
      CLG_(cachesim).log_1I1Dr_name = "(no function)";
      CLG_(cachesim).log_1I1Dw_name = "(no function)";
      CLG_(cachesim).log_1I2D_name  = "(no function)";

      CLG_(cachesim).log_0I1Dr = 0;
      CLG_(cachesim).log_0I1Dw = 0;
      CLG_(cachesim).log_0I2D  = 0;
      CLG_(cachesim).log_0I1Dr_name = "(no function)";
      CLG_(cachesim).log_0I1Dw_name = "(no function)";
      CLG_(cachesim).log_0I2D_name  = "(no function)";
      return;
   }

   /* Configuration of caches is only needed with real cache simulation */
   configure_caches(&I1c, &D1c, &L2c);

   I1.name = "I1";
   D1.name = "D1";
   L2.name = "L2";

   cachesim_initcache(I1c, &I1);
   cachesim_initcache(D1c, &D1);
   cachesim_initcache(L2c, &L2);

   /* the other cache simulators use the standard helpers
    * with dispatching via the simulator struct */

   CLG_(cachesim).log_1I0D      = log_1I0D;
   CLG_(cachesim).log_1I0D_name = "log_1I0D";

   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
   CLG_(cachesim).log_1I2D  = log_1I2D;
   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
   CLG_(cachesim).log_1I2D_name  = "log_1I2D";

   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
   CLG_(cachesim).log_0I2D  = log_0I2D;
   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
   CLG_(cachesim).log_0I2D_name  = "log_0I2D";

   if (clo_collect_cacheuse) {

      /* Output a warning for unsupported option combinations */
      if (clo_simulate_hwpref) {
         VG_(message)(Vg_DebugMsg,
                      "warning: prefetch simulation cannot be used with cache usage");
         clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
         VG_(message)(Vg_DebugMsg,
                      "warning: write-back simulation cannot be used with cache usage");
         clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      /* usage collection does not distinguish writes from reads */
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
   }

   if (clo_simulate_hwpref) {
      prefetch_clear();

      if (clo_simulate_writeback) {
         simulator.I1_Read  = prefetch_I1_Read;
         simulator.D1_Read  = prefetch_D1_Read;
         simulator.D1_Write = prefetch_D1_Write;
      }
      else {
         simulator.I1_Read  = prefetch_I1_ref;
         simulator.D1_Read  = prefetch_D1_ref;
         simulator.D1_Write = prefetch_D1_ref;
      }

      return;
   }

   if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
   }
   else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
   }
}


/* Clear simulator state. Has to be initialized before. */
static
void cachesim_clear(void)
{
   cachesim_clearcache(&I1);
   cachesim_clearcache(&D1);
   cachesim_clearcache(&L2);

   prefetch_clear();
}


static void cachesim_getdesc(Char* buf)
{
   Int p;
   p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
   VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
}

static
void cachesim_print_opts(void)
{
   VG_(printf)(
"\n   cache simulator options:\n"
"    --simulate-cache=no|yes   Do cache simulation [no]\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
   );
}

static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
{
   int   i1, i2, i3;
   int   i;
   char *opt = VG_(strdup)(orig_opt);

   i = i1 = opt_len;

   /* Option looks like "--I1=65536,2,64".
    * Find commas, replace with NULs to make three independent
    * strings, then extract numbers.  Yuck. */
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i2 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i3 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if ('\0' != opt[i]) goto bad;

   cache->size      = (Int)VG_(atoll)(opt + i1);
   cache->assoc     = (Int)VG_(atoll)(opt + i2);
   cache->line_size = (Int)VG_(atoll)(opt + i3);

   VG_(free)(opt);

   return;

  bad:
   VG_(bad_option)(orig_opt);
}
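
/* Example: parse_opt(&clo_I1_cache, "--I1=65536,2,64", 5) splits the
 * option at the commas and yields size 65536, assoc 2, line_size 64;
 * any malformed option ends up in VG_(bad_option)().
 */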

/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
   if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
      clo_simulate_writeback = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
      clo_simulate_writeback = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
      clo_simulate_hwpref = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
      clo_simulate_hwpref = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
      clo_simulate_sectors = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
      clo_simulate_sectors = False;

   else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
      clo_collect_cacheuse = True;
      /* Use counters only make sense with fine-grained
       * (per-instruction) dumping */
      CLG_(clo).dump_instr = True;
   }
   else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
      clo_collect_cacheuse = False;

   /* 5 is the length of "--I1=" */
   else if (0 == VG_(strncmp)(arg, "--I1=", 5))
      parse_opt(&clo_I1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--D1=", 5))
      parse_opt(&clo_D1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--L2=", 5))
      parse_opt(&clo_L2_cache, arg, 5);
   else
      return False;

   return True;
}

/* Adds commas to ULong, right-justifying it in a field field_width wide;
 * returns the string in buf. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}
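
/* Example: commify(1234567, 12, buf) writes "   1,234,567" into buf
 * (right-justified in 12 characters) and returns 9, the length of the
 * commified number itself.
 */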

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;  /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0; i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
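
/* Example: percentify(1234, 10, 8, buf) formats 123.4% right-justified
 * in an 8-character field, giving "  123.4%".
 */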
1733
1734static
1735void cachesim_printstat(void)
1736{
1737 FullCost total = CLG_(total_cost), D_total = 0;
1738 ULong L2_total_m, L2_total_mr, L2_total_mw,
1739 L2_total, L2_total_r, L2_total_w;
1740 char buf1[RESULTS_BUF_LEN],
1741 buf2[RESULTS_BUF_LEN],
1742 buf3[RESULTS_BUF_LEN];
1743 Int l1, l2, l3;
1744 Int p;
1745
1746 if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1747 VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
1748 prefetch_up);
1749 VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
1750 prefetch_down);
1751 VG_(message)(Vg_DebugMsg, "");
1752 }
1753
1754 /* I cache results. Use the I_refs value to determine the first column
1755 * width. */
1756 l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
1757 VG_(message)(Vg_UserMsg, "I refs: %s", buf1);
1758
1759 if (!CLG_(clo).simulate_cache) return;
1760
1761 commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
1762 VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);
1763
1764 commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
1765 VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);
1766
1767 p = 100;
1768
1769 if (0 == total[CLG_(sets).off_full_Ir])
1770 total[CLG_(sets).off_full_Ir] = 1;
1771
1772 percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
1773 total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1774 VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);
1775
1776 percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
1777 total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1778 VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
1779 VG_(message)(Vg_UserMsg, "");
1780
1781 /* D cache results.
1782 Use the D_refs.rd and D_refs.wr values to determine the
1783 * width of columns 2 & 3. */
1784
1785 D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1786 CLG_(init_cost)( CLG_(sets).full, D_total);
1787 CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
1788 CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );
1789
1790 commify( D_total[0], l1, buf1);
1791 l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
1792 l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
1793 VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
1794 buf1, buf2, buf3);
1795
1796 commify( D_total[1], l1, buf1);
1797 commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
1798 commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
1799 VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
1800 buf1, buf2, buf3);
1801
1802 commify( D_total[2], l1, buf1);
1803 commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
1804 commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
1805 VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
1806 buf1, buf2, buf3);
1807
1808 p = 10;
1809
1810 if (0 == D_total[0]) D_total[0] = 1;
1811 if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
1812 if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;
1813
1814 percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
1815 percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
1816 total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1817 percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
1818 total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1819 VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);
1820
1821 percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
1822 percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
1823 total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1824 percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
1825 total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1826 VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
1827 VG_(message)(Vg_UserMsg, "");
1828
1829
1830
1831 /* L2 overall results */
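   /* The L2 is accessed only on L1 misses, so its reference counts are
    * the sums of the I1/D1 miss counters (the +1 entries), and its miss
    * counts are the +2 entries. */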
1832
1833 L2_total =
1834 total[CLG_(sets).off_full_Dr +1] +
1835 total[CLG_(sets).off_full_Dw +1] +
1836 total[CLG_(sets).off_full_Ir +1];
1837 L2_total_r =
1838 total[CLG_(sets).off_full_Dr +1] +
1839 total[CLG_(sets).off_full_Ir +1];
1840 L2_total_w = total[CLG_(sets).off_full_Dw +1];
1841 commify(L2_total, l1, buf1);
1842 commify(L2_total_r, l2, buf2);
1843 commify(L2_total_w, l3, buf3);
1844 VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
1845 buf1, buf2, buf3);
1846
1847 L2_total_m =
1848 total[CLG_(sets).off_full_Dr +2] +
1849 total[CLG_(sets).off_full_Dw +2] +
1850 total[CLG_(sets).off_full_Ir +2];
1851 L2_total_mr =
1852 total[CLG_(sets).off_full_Dr +2] +
1853 total[CLG_(sets).off_full_Ir +2];
1854 L2_total_mw = total[CLG_(sets).off_full_Dw +2];
1855 commify(L2_total_m, l1, buf1);
1856 commify(L2_total_mr, l2, buf2);
1857 commify(L2_total_mw, l3, buf3);
1858 VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
1859 buf1, buf2, buf3);
1860
1861 percentify(L2_total_m * 100 * p /
1862 (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
1863 percentify(L2_total_mr * 100 * p /
1864 (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
1865 p, l2+1, buf2);
1866 percentify(L2_total_mw * 100 * p /
1867 total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1868 VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
1869                buf1, buf2, buf3);
1870}
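/* Illustrative sketch of the layout printed above (values invented for
 * the example; commify()/percentify() right-align the columns):
 *
 *    I refs:        1,000,000
 *    I1 misses:         2,000
 *    L2i misses:        1,000
 *    I1 miss rate:       0.20%
 *    L2i miss rate:      0.10%
 */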
1871
1872
1873/*------------------------------------------------------------*/
1874/*--- Setup for event sets.                                ---*/
1875/*------------------------------------------------------------*/
1876
1877struct event_sets CLG_(sets);
1878
1879void CLG_(init_eventsets)(Int max_user)
1880{
1881 EventType * e1, *e2, *e3, *e4;
1882 EventSet *Ir, *Dr, *Dw;
1883 EventSet *D0, *D1r, *D1w, *D2;
1884 EventSet *sim, *full;
1885 EventSet *use;
1886 int sizeOfUseIr;
1887
1888 use = CLG_(get_eventset)("Use", 4);
1889 if (clo_collect_cacheuse) {
1890     /* if TUse is 0, there was never a load, and hence no loss either */
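     /* Cache-use metrics come in per-level pairs: AcCost<n> counts
      * access cost and SpLoss<n> spatial loss for level n (1 = L1,
      * 2 = L2). */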
1891 e1 = CLG_(register_eventtype)("AcCost1");
1892 CLG_(add_eventtype)(use, e1);
1893 e1 = CLG_(register_eventtype)("SpLoss1");
1894 CLG_(add_eventtype)(use, e1);
1895 e1 = CLG_(register_eventtype)("AcCost2");
1896 CLG_(add_eventtype)(use, e1);
1897 e1 = CLG_(register_eventtype)("SpLoss2");
1898 CLG_(add_eventtype)(use, e1);
1899 }
1900
1901 Ir = CLG_(get_eventset)("Ir", 4);
1902 Dr = CLG_(get_eventset)("Dr", 4);
1903 Dw = CLG_(get_eventset)("Dw", 4);
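   /* With cache simulation, each access type carries an event triple
    * (access count, L1 miss, L2 miss); write-back simulation adds a
    * fourth, dirty-miss event per type (I2dmr/D2dmr/D2dmw). */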
1904 if (CLG_(clo).simulate_cache) {
1905 e1 = CLG_(register_eventtype)("Ir");
1906 e2 = CLG_(register_eventtype)("I1mr");
1907 e3 = CLG_(register_eventtype)("I2mr");
1908 if (clo_simulate_writeback) {
1909 e4 = CLG_(register_eventtype)("I2dmr");
1910 CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
1911 }
1912 else
1913 CLG_(add_dep_event3)(Ir, e1,e2,e3);
1914
1915 e1 = CLG_(register_eventtype)("Dr");
1916 e2 = CLG_(register_eventtype)("D1mr");
1917 e3 = CLG_(register_eventtype)("D2mr");
1918 if (clo_simulate_writeback) {
1919 e4 = CLG_(register_eventtype)("D2dmr");
1920 CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
1921 }
1922 else
1923 CLG_(add_dep_event3)(Dr, e1,e2,e3);
1924
1925 e1 = CLG_(register_eventtype)("Dw");
1926 e2 = CLG_(register_eventtype)("D1mw");
1927 e3 = CLG_(register_eventtype)("D2mw");
1928 if (clo_simulate_writeback) {
1929 e4 = CLG_(register_eventtype)("D2dmw");
1930 CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
1931 }
1932 else
1933 CLG_(add_dep_event3)(Dw, e1,e2,e3);
1934
1935 }
1936 else {
1937 e1 = CLG_(register_eventtype)("Ir");
1938 CLG_(add_eventtype)(Ir, e1);
1939 }
1940
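   /* Per-instruction eventset combinations (each with 'use' first):
    * D0 = instruction fetch only, D1r = fetch + one data read,
    * D1w = fetch + one data write, D2 = fetch + data read + write. */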
1941 sizeOfUseIr = use->size + Ir->size;
1942 D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
1943 CLG_(add_eventset)(D0, use);
1944 off_D0_Ir = CLG_(add_eventset)(D0, Ir);
1945
1946 D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
1947 CLG_(add_eventset)(D1r, use);
1948 off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
1949 off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
1950
1951 D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
1952 CLG_(add_eventset)(D1w, use);
1953 off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
1954 off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);
1955
1956 D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
1957 CLG_(add_eventset)(D2, use);
1958 off_D2_Ir = CLG_(add_eventset)(D2, Ir);
1959 off_D2_Dr = CLG_(add_eventset)(D2, Dr);
1960 off_D2_Dw = CLG_(add_eventset)(D2, Dw);
1961
1962 sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
1963 CLG_(add_eventset)(sim, use);
1964 CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
1965 CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
1966 CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);
1967
1968 if (CLG_(clo).collect_alloc) max_user += 2;
1969 if (CLG_(clo).collect_systime) max_user += 2;
1970
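   /* 'full' begins with a verbatim copy of 'sim', so the sim offsets
    * remain valid within 'full'; optional user events (allocation,
    * system time) are appended after it. */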
1971 full = CLG_(get_eventset)("full", sim->size + max_user);
1972 CLG_(add_eventset)(full, sim);
1973 CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
1974 CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
1975 CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;
1976
1977 CLG_(sets).use = use;
1978 CLG_(sets).Ir = Ir;
1979 CLG_(sets).Dr = Dr;
1980 CLG_(sets).Dw = Dw;
1981
1982 CLG_(sets).D0 = D0;
1983 CLG_(sets).D1r = D1r;
1984 CLG_(sets).D1w = D1w;
1985 CLG_(sets).D2 = D2;
1986
1987 CLG_(sets).sim = sim;
1988 CLG_(sets).full = full;
1989
1990 if (CLG_(clo).collect_alloc) {
1991 e1 = CLG_(register_eventtype)("allocCount");
1992 e2 = CLG_(register_eventtype)("allocSize");
1993 CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
1994 }
1995
1996 if (CLG_(clo).collect_systime) {
1997 e1 = CLG_(register_eventtype)("sysCount");
1998 e2 = CLG_(register_eventtype)("sysTime");
1999 CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
2000 }
2001
2002 CLG_DEBUGIF(1) {
2003 CLG_DEBUG(1, "EventSets:\n");
2004 CLG_(print_eventset)(-2, use);
2005 CLG_(print_eventset)(-2, Ir);
2006 CLG_(print_eventset)(-2, Dr);
2007 CLG_(print_eventset)(-2, Dw);
2008 CLG_(print_eventset)(-2, sim);
2009 CLG_(print_eventset)(-2, full);
2010 }
2011
2012  /* Events that do not exist are silently ignored */
2013 CLG_(dumpmap) = CLG_(get_eventmapping)(full);
2014 CLG_(append_event)(CLG_(dumpmap), "Ir");
2015 CLG_(append_event)(CLG_(dumpmap), "Dr");
2016 CLG_(append_event)(CLG_(dumpmap), "Dw");
2017 CLG_(append_event)(CLG_(dumpmap), "I1mr");
2018 CLG_(append_event)(CLG_(dumpmap), "D1mr");
2019 CLG_(append_event)(CLG_(dumpmap), "D1mw");
2020 CLG_(append_event)(CLG_(dumpmap), "I2mr");
2021 CLG_(append_event)(CLG_(dumpmap), "D2mr");
2022 CLG_(append_event)(CLG_(dumpmap), "D2mw");
2023 CLG_(append_event)(CLG_(dumpmap), "I2dmr");
2024 CLG_(append_event)(CLG_(dumpmap), "D2dmr");
2025 CLG_(append_event)(CLG_(dumpmap), "D2dmw");
2026 CLG_(append_event)(CLG_(dumpmap), "AcCost1");
2027 CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
2028 CLG_(append_event)(CLG_(dumpmap), "AcCost2");
2029 CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
2030 CLG_(append_event)(CLG_(dumpmap), "allocCount");
2031 CLG_(append_event)(CLG_(dumpmap), "allocSize");
2032 CLG_(append_event)(CLG_(dumpmap), "sysCount");
2033 CLG_(append_event)(CLG_(dumpmap), "sysTime");
2034
2035}
2036
2037
2038
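/* Add the entries of the per-instruction cost vector 'cost' (laid out
 * according to eventset 'es') into the matching positions of the
 * full-layout vector 'dst', zeroing the source entries. */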
2039static
2040void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
2041{
2042   /* if the 'use' eventset is defined, it always comes first (hardcoded!) */
2043 CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);
2044
2045 /* FIXME: This is hardcoded... */
2046 if (es == CLG_(sets).D0) {
2047 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2048 cost + off_D0_Ir);
2049 }
2050 else if (es == CLG_(sets).D1r) {
2051 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2052 cost + off_D1r_Ir);
2053 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2054 cost + off_D1r_Dr);
2055 }
2056 else if (es == CLG_(sets).D1w) {
2057 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2058 cost + off_D1w_Ir);
2059 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2060 cost + off_D1w_Dw);
2061 }
2062 else {
2063 CLG_ASSERT(es == CLG_(sets).D2);
2064 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2065 cost + off_D2_Ir);
2066 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2067 cost + off_D2_Dr);
2068 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2069 cost + off_D2_Dw);
2070 }
2071}
2072
2073/* This is called at dump time for every instruction executed. */
2074static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
2075 InstrInfo* ii, ULong exe_count)
2076{
2077 if (!CLG_(clo).simulate_cache)
2078 cost[CLG_(sets).off_sim_Ir] += exe_count;
2079 else {
2080
2081#if 0
2082/* There is a harmless case where exe_count and Ir can differ
2083 * slightly, because ecounter is only updated when the next BB is
2084 * executed: e.g. for the last BB executed, or when collection is toggled.
2085 */
2086 /* FIXME: Hardcoded that each eventset has Ir as first */
2087 if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
2088 VG_(printf)("==> Ir %llu, exe %llu\n",
2089 (bbcc->cost + ii->cost_offset)[0], exe_count);
2090 CLG_(print_bbcc_cost)(-2, bbcc);
2091 //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
2092 }
2093#endif
2094
2095 add_and_zero_Dx(ii->eventset, cost,
2096 bbcc->cost + ii->cost_offset);
2097 }
2098}
2099
2100static
2101void cachesim_after_bbsetup(void)
2102{
2103 BBCC* bbcc = CLG_(current_state).bbcc;
2104
2105 if (CLG_(clo).simulate_cache) {
2106 BB* bb = bbcc->bb;
2107
2108 /* only needed if log_* functions are called */
2109 bb_base = bb->obj->offset + bb->offset;
2110 cost_base = bbcc->cost;
2111 }
2112}
2113
2114static
2115void cachesim_finish(void)
2116{
2117 if (clo_collect_cacheuse)
2118 cacheuse_finish();
2119}
2120
2121/*------------------------------------------------------------*/
2122/*--- The simulator defined in this file ---*/
2123/*------------------------------------------------------------*/
2124
2125struct cachesim_if CLG_(cachesim) = {
2126 .print_opts = cachesim_print_opts,
2127 .parse_opt = cachesim_parse_opt,
2128 .post_clo_init = cachesim_post_clo_init,
2129 .clear = cachesim_clear,
2130 .getdesc = cachesim_getdesc,
2131 .printstat = cachesim_printstat,
2132 .add_icost = cachesim_add_icost,
2133 .after_bbsetup = cachesim_after_bbsetup,
2134 .finish = cachesim_finish,
2135
2136 /* these will be set by cachesim_post_clo_init */
2137 .log_1I0D = 0,
2138
2139 .log_1I1Dr = 0,
2140 .log_1I1Dw = 0,
2141 .log_1I2D = 0,
2142
2143 .log_0I1Dr = 0,
2144 .log_0I1Dw = 0,
2145 .log_0I2D = 0,
2146
2147 .log_1I0D_name = "(no function)",
2148
2149 .log_1I1Dr_name = "(no function)",
2150 .log_1I1Dw_name = "(no function)",
2151 .log_1I2D_name = "(no function)",
2152
2153 .log_0I1Dr_name = "(no function)",
2154 .log_0I1Dw_name = "(no function)",
2155 .log_0I2D_name = "(no function)"
2156};
2157
2158
2159/*--------------------------------------------------------------------*/
2160/*--- end                                                   sim.c ---*/
2161/*--------------------------------------------------------------------*/
2162