blob: 506ed9e400071b6eedc20ced4389e33d740ffdbb [file] [log] [blame]
weidendoa17f2a32006-03-20 10:27:30 +00001
2/*--------------------------------------------------------------------*/
3/*--- Cache simulation. ---*/
4/*--- sim.c ---*/
5/*--------------------------------------------------------------------*/
6
7/*
njn9a0cba42007-04-15 22:15:57 +00008 This file is part of Callgrind, a Valgrind tool for call graph
9 profiling programs.
weidendoa17f2a32006-03-20 10:27:30 +000010
njn9a0cba42007-04-15 22:15:57 +000011 Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
weidendoa17f2a32006-03-20 10:27:30 +000012
njn9a0cba42007-04-15 22:15:57 +000013 This tool is derived from and contains code from Cachegrind
sewardj4d474d02008-02-11 11:34:59 +000014 Copyright (C) 2002-2008 Nicholas Nethercote (njn@valgrind.org)
weidendoa17f2a32006-03-20 10:27:30 +000015
16 This program is free software; you can redistribute it and/or
17 modify it under the terms of the GNU General Public License as
18 published by the Free Software Foundation; either version 2 of the
19 License, or (at your option) any later version.
20
21 This program is distributed in the hope that it will be useful, but
22 WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with this program; if not, write to the Free Software
28 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 02111-1307, USA.
30
31 The GNU General Public License is contained in the file COPYING.
32*/
33
34#include "global.h"
35
36
37/* Notes:
38 - simulates a write-allocate cache
39 - (block --> set) hash function uses simple bit selection
40 - handling of references straddling two cache blocks:
41 - counts as only one cache access (not two)
42 - both blocks hit --> one hit
43 - one block hits, the other misses --> one miss
44 - both blocks miss --> one miss (not two)
45*/
46
47/* Cache configuration */
48#include "cg_arch.h"
49
/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
57typedef struct {
58 UInt count;
59 UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
60} line_use;
61
62typedef struct {
63 Addr memline, iaddr;
64 line_use* dep_use; /* point to higher-level cacheblock for this memline */
65 ULong* use_base;
66} line_loaded;
67
68/* Cache state */
69typedef struct {
70 char* name;
71 int size; /* bytes */
72 int assoc;
73 int line_size; /* bytes */
74 Bool sectored; /* prefetch nearside cacheline on read */
75 int sets;
76 int sets_min_1;
77 int assoc_bits;
78 int line_size_bits;
79 int tag_shift;
80 UWord tag_mask;
81 char desc_line[128];
82 UWord* tags;
83
84 /* for cache use */
85 int line_size_mask;
86 int* line_start_mask;
87 int* line_end_mask;
88 line_loaded* loaded;
89 line_use* use;
90} cache_t2;
91
92/*
93 * States of flat caches in our model.
94 * We use a 2-level hierarchy,
95 */
96static cache_t2 I1, D1, L2;
97
/* The bits of a tag entry below the minimal line size never carry
 * address information and are used as flags for the cache line. */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
/* Flag bit: line was written to (write-back simulation). */
#define CACHELINE_DIRTY 1
101
102
103/* Cache simulator Options */
104static Bool clo_simulate_writeback = False;
105static Bool clo_simulate_hwpref = False;
106static Bool clo_simulate_sectors = False;
107static Bool clo_collect_cacheuse = False;
108
/* The following global vars are set up beforehand by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */
116
117/* Offset to events in event set, used in log_* functions */
118static Int off_D0_Ir;
119static Int off_D1r_Ir;
120static Int off_D1r_Dr;
121static Int off_D1w_Ir;
122static Int off_D1w_Dw;
123static Int off_D2_Ir;
124static Int off_D2_Dr;
125static Int off_D2_Dw;
126
127static Addr bb_base;
128static ULong* cost_base;
129static InstrInfo* current_ii;
130
131/* Cache use offsets */
132/* FIXME: The offsets are only correct because all eventsets get
133 * the "Use" set added first !
134 */
135static Int off_I1_AcCost = 0;
136static Int off_I1_SpLoss = 1;
137static Int off_D1_AcCost = 0;
138static Int off_D1_SpLoss = 1;
139static Int off_L2_AcCost = 2;
140static Int off_L2_SpLoss = 3;
141
142/* Cache access types */
143typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
144
/* Result of a reference into a flat cache.
 * MissDirty: a miss which replaced a dirty line. */
typedef enum {
    Hit       = 0,
    Miss      = 1,
    MissDirty = 2
} CacheResult;
147
/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit             = 0,
    L2_Hit             = 1,
    MemAccess          = 2,
    WriteBackMemAccess = 3
} CacheModelResult;
154
155typedef CacheModelResult (*simcall_type)(Addr, UChar);
156
157static struct {
158 simcall_type I1_Read;
159 simcall_type D1_Read;
160 simcall_type D1_Write;
161} simulator;
162
163/*------------------------------------------------------------*/
164/*--- Cache Simulator Initialization ---*/
165/*------------------------------------------------------------*/
166
167static void cachesim_clearcache(cache_t2* c)
168{
169 Int i;
170
171 for (i = 0; i < c->sets * c->assoc; i++)
172 c->tags[i] = 0;
173 if (c->use) {
174 for (i = 0; i < c->sets * c->assoc; i++) {
175 c->loaded[i].memline = 0;
176 c->loaded[i].use_base = 0;
177 c->loaded[i].dep_use = 0;
178 c->loaded[i].iaddr = 0;
179 c->use[i].mask = 0;
180 c->use[i].count = 0;
181 c->tags[i] = i % c->assoc; /* init lower bits as pointer */
182 }
183 }
184}
185
186static void cacheuse_initcache(cache_t2* c);
187
188/* By this point, the size/assoc/line_size has been checked. */
189static void cachesim_initcache(cache_t config, cache_t2* c)
190{
191 c->size = config.size;
192 c->assoc = config.assoc;
193 c->line_size = config.line_size;
194 c->sectored = False; // FIXME
195
196 c->sets = (c->size / c->line_size) / c->assoc;
197 c->sets_min_1 = c->sets - 1;
198 c->assoc_bits = VG_(log2)(c->assoc);
199 c->line_size_bits = VG_(log2)(c->line_size);
200 c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
201 c->tag_mask = ~((1<<c->tag_shift)-1);
202
203 /* Can bits in tag entries be used for flags?
204 * Should be always true as MIN_LINE_SIZE >= 16 */
205 CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
206
207 if (c->assoc == 1) {
208 VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
209 c->size, c->line_size,
210 c->sectored ? ", sectored":"");
211 } else {
212 VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
213 c->size, c->line_size, c->assoc,
214 c->sectored ? ", sectored":"");
215 }
216
sewardj9c606bd2008-09-18 18:12:50 +0000217 c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
218 sizeof(UWord) * c->sets * c->assoc);
weidendoa17f2a32006-03-20 10:27:30 +0000219 if (clo_collect_cacheuse)
220 cacheuse_initcache(c);
221 else
222 c->use = 0;
223 cachesim_clearcache(c);
224}
225
226
#if 0
/* Debugging aid (disabled): print all tags of <c>, one set per line. */
static void print_cache(cache_t2* c)
{
    UInt set, way;
    UInt i = 0;   /* running index into c->tags over all sets */

    for (set = 0; set < c->sets; set++) {
        for (way = 0; way < c->assoc; way++)
            VG_(printf)("%8x ", c->tags[i++]);
        VG_(printf)("\n");
    }
}
#endif
241
242
243/*------------------------------------------------------------*/
244/*--- Write Through Cache Simulation ---*/
245/*------------------------------------------------------------*/
246
247/*
248 * Simple model: L1 & L2 Write Through
249 * Does not distinguish among read and write references
250 *
251 * Simulator functions:
252 * CacheModelResult cachesim_I1_ref(Addr a, UChar size)
253 * CacheModelResult cachesim_D1_ref(Addr a, UChar size)
254 */
255
256static __inline__
257CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
258{
259 int i, j;
260 UWord *set;
261
262 /* Shifting is a bit faster than multiplying */
263 set = &(c->tags[set_no << c->assoc_bits]);
264
265 /* This loop is unrolled for just the first case, which is the most */
266 /* common. We can't unroll any further because it would screw up */
267 /* if we have a direct-mapped (1-way) cache. */
268 if (tag == set[0])
269 return Hit;
270
271 /* If the tag is one other than the MRU, move it into the MRU spot */
272 /* and shuffle the rest down. */
273 for (i = 1; i < c->assoc; i++) {
274 if (tag == set[i]) {
275 for (j = i; j > 0; j--) {
276 set[j] = set[j - 1];
277 }
278 set[0] = tag;
279 return Hit;
280 }
281 }
282
283 /* A miss; install this tag as MRU, shuffle rest down. */
284 for (j = c->assoc - 1; j > 0; j--) {
285 set[j] = set[j - 1];
286 }
287 set[0] = tag;
288
289 return Miss;
290}
291
292static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
293{
294 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
295 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
296 UWord tag = a >> c->tag_shift;
297
298 /* Access entirely within line. */
299 if (set1 == set2)
300 return cachesim_setref(c, set1, tag);
301
302 /* Access straddles two lines. */
303 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
304 else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo28e2a142006-11-22 21:00:53 +0000305 UWord tag2 = (a+size-1) >> c->tag_shift;
weidendoa17f2a32006-03-20 10:27:30 +0000306
307 /* the call updates cache structures as side effect */
308 CacheResult res1 = cachesim_setref(c, set1, tag);
weidendo28e2a142006-11-22 21:00:53 +0000309 CacheResult res2 = cachesim_setref(c, set2, tag2);
weidendoa17f2a32006-03-20 10:27:30 +0000310 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
311
312 } else {
njn8a7b41b2007-09-23 00:51:24 +0000313 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendoa17f2a32006-03-20 10:27:30 +0000314 VG_(tool_panic)("item straddles more than two cache sets");
315 }
316 return Hit;
317}
318
319static
320CacheModelResult cachesim_I1_ref(Addr a, UChar size)
321{
322 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
323 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
324 return MemAccess;
325}
326
327static
328CacheModelResult cachesim_D1_ref(Addr a, UChar size)
329{
330 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
331 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
332 return MemAccess;
333}
334
335
336/*------------------------------------------------------------*/
337/*--- Write Back Cache Simulation ---*/
338/*------------------------------------------------------------*/
339
340/*
341 * More complex model: L1 Write-through, L2 Write-back
342 * This needs to distinguish among read and write references.
343 *
344 * Simulator functions:
345 * CacheModelResult cachesim_I1_Read(Addr a, UChar size)
346 * CacheModelResult cachesim_D1_Read(Addr a, UChar size)
347 * CacheModelResult cachesim_D1_Write(Addr a, UChar size)
348 */
349
350/*
351 * With write-back, result can be a miss evicting a dirty line
352 * The dirty state of a cache line is stored in Bit0 of the tag for
353 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
354 * type (Read/Write), the line gets dirty on a write.
355 */
356static __inline__
357CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
358{
359 int i, j;
360 UWord *set, tmp_tag;
361
362 /* Shifting is a bit faster than multiplying */
363 set = &(c->tags[set_no << c->assoc_bits]);
364
365 /* This loop is unrolled for just the first case, which is the most */
366 /* common. We can't unroll any further because it would screw up */
367 /* if we have a direct-mapped (1-way) cache. */
368 if (tag == (set[0] & ~CACHELINE_DIRTY)) {
369 set[0] |= ref;
370 return Hit;
371 }
372 /* If the tag is one other than the MRU, move it into the MRU spot */
373 /* and shuffle the rest down. */
374 for (i = 1; i < c->assoc; i++) {
375 if (tag == (set[i] & ~CACHELINE_DIRTY)) {
376 tmp_tag = set[i] | ref; // update dirty flag
377 for (j = i; j > 0; j--) {
378 set[j] = set[j - 1];
379 }
380 set[0] = tmp_tag;
381 return Hit;
382 }
383 }
384
385 /* A miss; install this tag as MRU, shuffle rest down. */
386 tmp_tag = set[c->assoc - 1];
387 for (j = c->assoc - 1; j > 0; j--) {
388 set[j] = set[j - 1];
389 }
390 set[0] = tag | ref;
391
392 return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
393}
394
395
396static __inline__
397CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
398{
399 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
400 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
401 UWord tag = a & c->tag_mask;
402
403 /* Access entirely within line. */
404 if (set1 == set2)
405 return cachesim_setref_wb(c, ref, set1, tag);
406
407 /* Access straddles two lines. */
408 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
409 else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo28e2a142006-11-22 21:00:53 +0000410 UWord tag2 = (a+size-1) >> c->tag_shift;
weidendoa17f2a32006-03-20 10:27:30 +0000411
412 /* the call updates cache structures as side effect */
413 CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
weidendo28e2a142006-11-22 21:00:53 +0000414 CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);
weidendoa17f2a32006-03-20 10:27:30 +0000415
416 if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
417 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
418
419 } else {
njn8a7b41b2007-09-23 00:51:24 +0000420 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendoa17f2a32006-03-20 10:27:30 +0000421 VG_(tool_panic)("item straddles more than two cache sets");
422 }
423 return Hit;
424}
425
426
427static
428CacheModelResult cachesim_I1_Read(Addr a, UChar size)
429{
430 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
431 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
432 case Hit: return L2_Hit;
433 case Miss: return MemAccess;
434 default: break;
435 }
436 return WriteBackMemAccess;
437}
438
439static
440CacheModelResult cachesim_D1_Read(Addr a, UChar size)
441{
442 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
443 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
444 case Hit: return L2_Hit;
445 case Miss: return MemAccess;
446 default: break;
447 }
448 return WriteBackMemAccess;
449}
450
451static
452CacheModelResult cachesim_D1_Write(Addr a, UChar size)
453{
454 if ( cachesim_ref( &D1, a, size) == Hit ) {
455 /* Even for a L1 hit, the write-trough L1 passes
456 * the write to the L2 to make the L2 line dirty.
457 * But this causes no latency, so return the hit.
458 */
459 cachesim_ref_wb( &L2, Write, a, size);
460 return L1_Hit;
461 }
462 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
463 case Hit: return L2_Hit;
464 case Miss: return MemAccess;
465 default: break;
466 }
467 return WriteBackMemAccess;
468}
469
470
471/*------------------------------------------------------------*/
472/*--- Hardware Prefetch Simulation ---*/
473/*------------------------------------------------------------*/
474
475static ULong prefetch_up = 0;
476static ULong prefetch_down = 0;
477
478#define PF_STREAMS 8
479#define PF_PAGEBITS 12
480
481static UInt pf_lastblock[PF_STREAMS];
482static Int pf_seqblocks[PF_STREAMS];
483
484static
485void prefetch_clear(void)
486{
487 int i;
488 for(i=0;i<PF_STREAMS;i++)
489 pf_lastblock[i] = pf_seqblocks[i] = 0;
490}
491
492/*
493 * HW Prefetch emulation
494 * Start prefetching when detecting sequential access to 3 memory blocks.
495 * One stream can be detected per 4k page.
496 */
497static __inline__
498void prefetch_L2_doref(Addr a, UChar size)
499{
500 UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
501 UInt block = ( a >> L2.line_size_bits);
502
503 if (block != pf_lastblock[stream]) {
504 if (pf_seqblocks[stream] == 0) {
505 if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
506 else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
507 }
508 else if (pf_seqblocks[stream] >0) {
509 if (pf_lastblock[stream] +1 == block) {
510 pf_seqblocks[stream]++;
511 if (pf_seqblocks[stream] >= 2) {
512 prefetch_up++;
513 cachesim_ref(&L2, a + 5 * L2.line_size,1);
514 }
515 }
516 else pf_seqblocks[stream] = 0;
517 }
518 else if (pf_seqblocks[stream] <0) {
519 if (pf_lastblock[stream] -1 == block) {
520 pf_seqblocks[stream]--;
521 if (pf_seqblocks[stream] <= -2) {
522 prefetch_down++;
523 cachesim_ref(&L2, a - 5 * L2.line_size,1);
524 }
525 }
526 else pf_seqblocks[stream] = 0;
527 }
528 pf_lastblock[stream] = block;
529 }
530}
531
532/* simple model with hardware prefetch */
533
534static
535CacheModelResult prefetch_I1_ref(Addr a, UChar size)
536{
537 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
538 prefetch_L2_doref(a,size);
539 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
540 return MemAccess;
541}
542
543static
544CacheModelResult prefetch_D1_ref(Addr a, UChar size)
545{
546 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
547 prefetch_L2_doref(a,size);
548 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
549 return MemAccess;
550}
551
552
553/* complex model with hardware prefetch */
554
555static
556CacheModelResult prefetch_I1_Read(Addr a, UChar size)
557{
558 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
559 prefetch_L2_doref(a,size);
560 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
561 case Hit: return L2_Hit;
562 case Miss: return MemAccess;
563 default: break;
564 }
565 return WriteBackMemAccess;
566}
567
568static
569CacheModelResult prefetch_D1_Read(Addr a, UChar size)
570{
571 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
572 prefetch_L2_doref(a,size);
573 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
574 case Hit: return L2_Hit;
575 case Miss: return MemAccess;
576 default: break;
577 }
578 return WriteBackMemAccess;
579}
580
581static
582CacheModelResult prefetch_D1_Write(Addr a, UChar size)
583{
584 prefetch_L2_doref(a,size);
585 if ( cachesim_ref( &D1, a, size) == Hit ) {
586 /* Even for a L1 hit, the write-trough L1 passes
587 * the write to the L2 to make the L2 line dirty.
588 * But this causes no latency, so return the hit.
589 */
590 cachesim_ref_wb( &L2, Write, a, size);
591 return L1_Hit;
592 }
593 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
594 case Hit: return L2_Hit;
595 case Miss: return MemAccess;
596 default: break;
597 }
598 return WriteBackMemAccess;
599}
600
601
602/*------------------------------------------------------------*/
603/*--- Cache Simulation with use metric collection ---*/
604/*------------------------------------------------------------*/
605
606/* can not be combined with write-back or prefetch */
607
608static
609void cacheuse_initcache(cache_t2* c)
610{
611 int i;
612 unsigned int start_mask, start_val;
613 unsigned int end_mask, end_val;
614
sewardj9c606bd2008-09-18 18:12:50 +0000615 c->use = CLG_MALLOC("cl.sim.cu_ic.1",
616 sizeof(line_use) * c->sets * c->assoc);
617 c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
618 sizeof(line_loaded) * c->sets * c->assoc);
619 c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
620 sizeof(int) * c->line_size);
621 c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
622 sizeof(int) * c->line_size);
weidendoa17f2a32006-03-20 10:27:30 +0000623
weidendoa17f2a32006-03-20 10:27:30 +0000624 c->line_size_mask = c->line_size-1;
625
626 /* Meaning of line_start_mask/line_end_mask
627 * Example: for a given cache line, you get an access starting at
628 * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
629 * line size of 32, you have 1 bit per byte in the mask:
630 *
631 * bit31 bit8 bit5 bit 0
632 * | | | |
633 * 11..111111100000 line_start_mask[5]
634 * 00..000111111111 line_end_mask[(5+4)-1]
635 *
636 * use_mask |= line_start_mask[5] && line_end_mask[8]
637 *
638 */
639 start_val = end_val = ~0;
640 if (c->line_size < 32) {
641 int bits_per_byte = 32/c->line_size;
642 start_mask = (1<<bits_per_byte)-1;
643 end_mask = start_mask << (32-bits_per_byte);
644 for(i=0;i<c->line_size;i++) {
645 c->line_start_mask[i] = start_val;
646 start_val = start_val & ~start_mask;
647 start_mask = start_mask << bits_per_byte;
648
649 c->line_end_mask[c->line_size-i-1] = end_val;
650 end_val = end_val & ~end_mask;
651 end_mask = end_mask >> bits_per_byte;
652 }
653 }
654 else {
655 int bytes_per_bit = c->line_size/32;
656 start_mask = 1;
657 end_mask = 1 << 31;
658 for(i=0;i<c->line_size;i++) {
659 c->line_start_mask[i] = start_val;
660 c->line_end_mask[c->line_size-i-1] = end_val;
661 if ( ((i+1)%bytes_per_bit) == 0) {
662 start_val &= ~start_mask;
663 end_val &= ~end_mask;
664 start_mask <<= 1;
665 end_mask >>= 1;
666 }
667 }
668 }
669
670 CLG_DEBUG(6, "Config %s:\n", c->desc_line);
671 for(i=0;i<c->line_size;i++) {
672 CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
673 i, c->line_start_mask[i], c->line_end_mask[i]);
674 }
675
676 /* We use lower tag bits as offset pointers to cache use info.
677 * I.e. some cache parameters don't work.
678 */
679 if (c->tag_shift < c->assoc_bits) {
680 VG_(message)(Vg_DebugMsg,
681 "error: Use associativity < %d for cache use statistics!",
682 (1<<c->tag_shift) );
683 VG_(tool_panic)("Unsupported cache configuration");
684 }
685}
686
/* FIXME: A little tricky */
/* Disabled sketch of function versions of the cache use simulation.
 * It cannot be enabled as it stands: cacheuse_setref() uses 'a',
 * 'size' and 'L' which are not declared in it, it calls
 * cacheuse_update() while the helper above it is named
 * cacheuse_update_hit(), and neither cacheuse_L2_miss() nor
 * cacheuse_isMiss() is defined in the visible part of this file.
 */
#if 0

static __inline__
void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
{
    int idx = (high_idx << c->assoc_bits) | low_idx;

    c->use[idx].count ++;
    c->use[idx].mask |= use_mask;

    CLG_DEBUG(6," Hit [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",
              idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
              use_mask, c->use[idx].mask, c->use[idx].count);
}

/* only used for I1, D1 */

static __inline__
CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j, idx;
    UWord *set, tmp_tag;
    UInt use_mask;

    /* Shifting is a bit faster than multiplying */
    set = &(c->tags[set_no << c->assoc_bits]);
    use_mask =
        c->line_start_mask[a & c->line_size_mask] &
        c->line_end_mask[(a+size-1) & c->line_size_mask];

    /* This loop is unrolled for just the first case, which is the most */
    /* common. We can't unroll any further because it would screw up */
    /* if we have a direct-mapped (1-way) cache. */
    if (tag == (set[0] & c->tag_mask)) {
        cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask);
        return L1_Hit;
    }

    /* If the tag is one other than the MRU, move it into the MRU spot */
    /* and shuffle the rest down. */
    for (i = 1; i < c->assoc; i++) {
        if (tag == (set[i] & c->tag_mask)) {
            tmp_tag = set[i];
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;

            cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
            return L1_Hit;
        }
    }

    /* A miss; install this tag as MRU, shuffle rest down. */
    tmp_tag = set[L.assoc - 1] & ~c->tag_mask;
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | tmp_tag;

    cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
                     use_mask, a & ~c->line_size_mask);

    return Miss;
}


static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
{
    UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a >> c->tag_shift;

    /* Access entirely within line. */
    if (set1 == set2)
        return cacheuse_setref(c, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
        UWord tag2 = a >> c->tag_shift;

        /* the call updates cache structures as side effect */
        CacheResult res1 = cacheuse_isMiss(c, set1, tag);
        CacheResult res2 = cacheuse_isMiss(c, set2, tag2);
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

    } else {
        VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2);
        VG_(tool_panic)("item straddles more than two cache sets");
    }
    return Hit;
}
#endif
782
783
/* for I1/D1 caches
 *
 * CACHEUSE(L) defines cacheuse_<L>_doRead(Addr a, UChar size), which
 * references <size> bytes at address <a> in the first-level cache L
 * while collecting cache use.
 *
 * A tag entry of L holds the address bits selected by L.tag_mask; its
 * remaining lower bits hold the index, inside the set, of the use info
 * belonging to the line. On a hit, the use counter of the line is
 * incremented and the touched part is added to its use mask. On a
 * miss, the entry dropping out of the set hands its use info index
 * over to the new line, and update_<L>_use() is called for this index
 * with the use mask and the start address of the new memory line.
 * update_<L>_use() has to be declared before the macro is used.
 *
 * Result for an access inside one line: L1_Hit, or on a miss what
 * update_<L>_use() returned. An access straddling two lines handles
 * both lines; if the second line hits, the result of the first line
 * is returned, otherwise MemAccess if any of the two misses gave
 * MemAccess, else L2_Hit. An access touching more than two sets
 * makes the tool panic.
 */
#define CACHEUSE(L) \
 \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
{ \
    UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
    UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
    UWord tag = a & L.tag_mask; \
    UWord tag2; \
    int i, j, idx; \
    UWord *set, tmp_tag; \
    UInt use_mask; \
 \
    CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n", \
              L.name, a, size, set1, set2); \
 \
    /* First case: word entirely within line. */ \
    if (set1 == set2) { \
 \
        /* Shifting is a bit faster than multiplying */ \
        set = &(L.tags[set1 << L.assoc_bits]); \
        use_mask = L.line_start_mask[a & L.line_size_mask] & \
                   L.line_end_mask[(a+size-1) & L.line_size_mask]; \
 \
        /* The MRU entry is checked first as the most common case. */ \
        /* No further unrolling, to keep direct-mapped caches working. */ \
        if (tag == (set[0] & L.tag_mask)) { \
            idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                      idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return L1_Hit; \
        } \
        /* Hit in another entry: move it into the MRU spot */ \
        /* and shuffle the rest down. */ \
        for (i = 1; i < L.assoc; i++) { \
            if (tag == (set[i] & L.tag_mask)) { \
                tmp_tag = set[i]; \
                for (j = i; j > 0; j--) { \
                    set[j] = set[j - 1]; \
                } \
                set[0] = tmp_tag; \
                idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
                L.use[idx].count ++; \
                L.use[idx].mask |= use_mask; \
                CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                          i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                          use_mask, L.use[idx].mask, L.use[idx].count); \
                return L1_Hit; \
            } \
        } \
 \
        /* A miss: the new tag takes over the use info index of the */ \
        /* entry dropping out and is installed as MRU. */ \
        tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
        for (j = L.assoc - 1; j > 0; j--) { \
            set[j] = set[j - 1]; \
        } \
        set[0] = tag | tmp_tag; \
        idx = (set1 << L.assoc_bits) | tmp_tag; \
        return update_##L##_use(&L, idx, \
                                use_mask, a &~ L.line_size_mask); \
 \
    /* Second case: word straddles two lines. */ \
    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
    } else if (((set1 + 1) & (L.sets-1)) == set2) { \
        Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
        /* first line: used from the start offset up to its end */ \
        set = &(L.tags[set1 << L.assoc_bits]); \
        use_mask = L.line_start_mask[a & L.line_size_mask]; \
        if (tag == (set[0] & L.tag_mask)) { \
            idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                      idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            goto block2; \
        } \
        for (i = 1; i < L.assoc; i++) { \
            if (tag == (set[i] & L.tag_mask)) { \
                tmp_tag = set[i]; \
                for (j = i; j > 0; j--) { \
                    set[j] = set[j - 1]; \
                } \
                set[0] = tmp_tag; \
                idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
                L.use[idx].count ++; \
                L.use[idx].mask |= use_mask; \
                CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                          i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                          use_mask, L.use[idx].mask, L.use[idx].count); \
                goto block2; \
            } \
        } \
        tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
        for (j = L.assoc - 1; j > 0; j--) { \
            set[j] = set[j - 1]; \
        } \
        set[0] = tag | tmp_tag; \
        idx = (set1 << L.assoc_bits) | tmp_tag; \
        miss1 = update_##L##_use(&L, idx, \
                                 use_mask, a &~ L.line_size_mask); \
block2: \
        /* second line: used from its start up to the end offset */ \
        set = &(L.tags[set2 << L.assoc_bits]); \
        use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
        tag2 = (a+size-1) & L.tag_mask; \
        if (tag2 == (set[0] & L.tag_mask)) { \
            idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                      idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return miss1; \
        } \
        for (i = 1; i < L.assoc; i++) { \
            if (tag2 == (set[i] & L.tag_mask)) { \
                tmp_tag = set[i]; \
                for (j = i; j > 0; j--) { \
                    set[j] = set[j - 1]; \
                } \
                set[0] = tmp_tag; \
                idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
                L.use[idx].count ++; \
                L.use[idx].mask |= use_mask; \
                CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                          i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                          use_mask, L.use[idx].mask, L.use[idx].count); \
                return miss1; \
            } \
        } \
        tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
        for (j = L.assoc - 1; j > 0; j--) { \
            set[j] = set[j - 1]; \
        } \
        set[0] = tag2 | tmp_tag; \
        idx = (set2 << L.assoc_bits) | tmp_tag; \
        miss2 = update_##L##_use(&L, idx, \
                                 use_mask, (a+size-1) &~ L.line_size_mask); \
        return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
 \
    } else { \
        VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
        VG_(tool_panic)("item straddles more than two cache sets"); \
    } \
    return 0; \
}
933
934
/* Number of bits set in the lower 32 bits of <bits>.
 *
 * Logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 * Every step adds up neighbouring bit fields, doubling the field
 * width: 1-bit fields to 2-bit sums, these to 4-bit sums, and so on
 * until one 32-bit sum is left.
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
    unsigned int sum = bits;

    sum = (sum & 0x55555555u) + ((sum >> 1)  & 0x55555555u);
    sum = (sum & 0x33333333u) + ((sum >> 2)  & 0x33333333u);
    sum = (sum & 0x0F0F0F0Fu) + ((sum >> 4)  & 0x0F0F0F0Fu);
    sum = (sum & 0x00FF00FFu) + ((sum >> 8)  & 0x00FF00FFu);
    sum = (sum & 0x0000FFFFu) + ((sum >> 16) & 0x0000FFFFu);

    return sum;
}
952
953static void update_L2_use(int idx, Addr memline)
954{
955 line_loaded* loaded = &(L2.loaded[idx]);
956 line_use* use = &(L2.use[idx]);
957 int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
958
barta0b6b2c2008-07-07 06:49:24 +0000959 CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
weidendoa17f2a32006-03-20 10:27:30 +0000960 idx, bb_base + current_ii->instr_offset, memline);
961 if (use->count>0) {
barta0b6b2c2008-07-07 06:49:24 +0000962 CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
weidendoa17f2a32006-03-20 10:27:30 +0000963 use->count, i, use->mask, loaded->memline, loaded->iaddr);
964 CLG_DEBUG(2, " collect: %d, use_base %p\n",
965 CLG_(current_state).collect, loaded->use_base);
966
967 if (CLG_(current_state).collect && loaded->use_base) {
968 (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
969 (loaded->use_base)[off_L2_SpLoss] += i;
970 }
971 }
972
973 use->count = 0;
974 use->mask = 0;
975
976 loaded->memline = memline;
977 loaded->iaddr = bb_base + current_ii->instr_offset;
978 loaded->use_base = (CLG_(current_state).nonskipped) ?
979 CLG_(current_state).nonskipped->skipped :
980 cost_base + current_ii->cost_offset;
981}
982
983static
984CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
985{
986 UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
987 UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
988 UWord tag = memline & L2.tag_mask;
989
990 int i, j, idx;
991 UWord tmp_tag;
992
barta0b6b2c2008-07-07 06:49:24 +0000993 CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
weidendoa17f2a32006-03-20 10:27:30 +0000994
995 if (tag == (set[0] & L2.tag_mask)) {
996 idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
997 l1_loaded->dep_use = &(L2.use[idx]);
998
barta0b6b2c2008-07-07 06:49:24 +0000999 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001000 idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
1001 L2.use[idx].mask, L2.use[idx].count);
1002 return L2_Hit;
1003 }
1004 for (i = 1; i < L2.assoc; i++) {
1005 if (tag == (set[i] & L2.tag_mask)) {
1006 tmp_tag = set[i];
1007 for (j = i; j > 0; j--) {
1008 set[j] = set[j - 1];
1009 }
1010 set[0] = tmp_tag;
1011 idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
1012 l1_loaded->dep_use = &(L2.use[idx]);
1013
barta0b6b2c2008-07-07 06:49:24 +00001014 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001015 i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
1016 L2.use[idx].mask, L2.use[idx].count);
1017 return L2_Hit;
1018 }
1019 }
1020
1021 /* A miss; install this tag as MRU, shuffle rest down. */
1022 tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
1023 for (j = L2.assoc - 1; j > 0; j--) {
1024 set[j] = set[j - 1];
1025 }
1026 set[0] = tag | tmp_tag;
1027 idx = (setNo << L2.assoc_bits) | tmp_tag;
1028 l1_loaded->dep_use = &(L2.use[idx]);
1029
1030 update_L2_use(idx, memline);
1031
1032 return MemAccess;
1033}
1034
1035
1036
1037
/* UPDATE_USE(L) defines update_<L>_use(cache, idx, mask, memline) for a
 * first-level cache L.
 *
 * The generated function closes the accounting for the line previously
 * held in slot 'idx' (access cost 1000/count and spatial loss derived from
 * the 32-bit use mask), forwards that line's use mask and count to the L2
 * line it depended on, then records 'memline' as the new occupant with an
 * initial use count of 1 and use mask 'mask'.
 * A 'memline' of 0 only flushes the counters and returns L2_Hit; otherwise
 * the result of the L2 access for the new line is returned.
 */
#define UPDATE_USE(L)                                                        \
                                                                             \
static CacheModelResult update_##L##_use(cache_t2* cache, int idx,           \
                                         UInt mask, Addr memline)            \
{                                                                            \
    line_loaded* line  = &(cache->loaded[idx]);                              \
    line_use*    usage = &(cache->use[idx]);                                 \
    int loss = ((32 - countBits(usage->mask)) * cache->line_size) >> 5;      \
                                                                             \
    CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
              cache->name, idx, bb_base + current_ii->instr_offset,          \
              memline, mask);                                                \
                                                                             \
    if (usage->count > 0) {                                                  \
        CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n", \
                  usage->count, loss, usage->mask,                           \
                  line->memline, line->iaddr);                               \
        CLG_DEBUG(2, " collect: %d, use_base %p\n",                          \
                  CLG_(current_state).collect, line->use_base);              \
                                                                             \
        if (CLG_(current_state).collect && line->use_base) {                 \
            line->use_base[off_##L##_AcCost] += 1000 / usage->count;         \
            line->use_base[off_##L##_SpLoss] += loss;                        \
                                                                             \
            /* FIXME (?): L1/L2 line sizes must be equal ! */                \
            line->dep_use->mask  |= usage->mask;                             \
            line->dep_use->count += usage->count;                            \
        }                                                                    \
    }                                                                        \
                                                                             \
    usage->count  = 1;                                                       \
    usage->mask   = mask;                                                    \
    line->memline = memline;                                                 \
    line->iaddr   = bb_base + current_ii->instr_offset;                      \
    if (CLG_(current_state).nonskipped)                                      \
        line->use_base = CLG_(current_state).nonskipped->skipped;            \
    else                                                                     \
        line->use_base = cost_base + current_ii->cost_offset;                \
                                                                             \
    if (memline == 0) return L2_Hit;                                         \
    return cacheuse_L2_access(memline, line);                                \
}
1076
1077UPDATE_USE(I1);
1078UPDATE_USE(D1);
1079
1080CACHEUSE(I1);
1081CACHEUSE(D1);
1082
1083
1084static
1085void cacheuse_finish(void)
1086{
1087 int i;
1088 InstrInfo ii = { 0,0,0,0,0 };
1089
1090 if (!CLG_(current_state).collect) return;
1091
1092 bb_base = 0;
1093 current_ii = &ii;
1094 cost_base = 0;
1095
1096 /* update usage counters */
1097 if (I1.use)
1098 for (i = 0; i < I1.sets * I1.assoc; i++)
1099 if (I1.loaded[i].use_base)
1100 update_I1_use( &I1, i, 0,0);
1101
1102 if (D1.use)
1103 for (i = 0; i < D1.sets * D1.assoc; i++)
1104 if (D1.loaded[i].use_base)
1105 update_D1_use( &D1, i, 0,0);
1106
1107 if (L2.use)
1108 for (i = 0; i < L2.sets * L2.assoc; i++)
1109 if (L2.loaded[i].use_base)
1110 update_L2_use(i, 0);
1111}
1112
1113
1114
1115/*------------------------------------------------------------*/
1116/*--- Helper functions called by instrumented code ---*/
1117/*------------------------------------------------------------*/
1118
1119
1120static __inline__
1121void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1122{
1123 switch(r) {
1124 case WriteBackMemAccess:
1125 if (clo_simulate_writeback) {
1126 c1[3]++;
1127 c2[3]++;
1128 }
1129 // fall through
1130
1131 case MemAccess:
1132 c1[2]++;
1133 c2[2]++;
1134 // fall through
1135
1136 case L2_Hit:
1137 c1[1]++;
1138 c2[1]++;
1139 // fall through
1140
1141 default:
1142 c1[0]++;
1143 c2[0]++;
1144 }
1145}
1146
1147
1148VG_REGPARM(1)
1149static void log_1I0D(InstrInfo* ii)
1150{
1151 CacheModelResult IrRes;
1152
1153 current_ii = ii;
1154 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1155
barta0b6b2c2008-07-07 06:49:24 +00001156 CLG_DEBUG(6, "log_1I0D: Ir=%#lx/%u => Ir %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001157 bb_base + ii->instr_offset, ii->instr_size, IrRes);
1158
1159 if (CLG_(current_state).collect) {
1160 ULong* cost_Ir;
1161
1162 if (CLG_(current_state).nonskipped)
1163 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1164 else
1165 cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
1166
1167 inc_costs(IrRes, cost_Ir,
1168 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1169 }
1170}
1171
1172
1173/* Instruction doing a read access */
1174
1175VG_REGPARM(2)
1176static void log_1I1Dr(InstrInfo* ii, Addr data)
1177{
1178 CacheModelResult IrRes, DrRes;
1179
1180 current_ii = ii;
1181 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1182 DrRes = (*simulator.D1_Read)(data, ii->data_size);
1183
barta0b6b2c2008-07-07 06:49:24 +00001184 CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001185 bb_base + ii->instr_offset, ii->instr_size,
1186 data, ii->data_size, IrRes, DrRes);
1187
1188 if (CLG_(current_state).collect) {
1189 ULong *cost_Ir, *cost_Dr;
1190
1191 if (CLG_(current_state).nonskipped) {
1192 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1193 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1194 }
1195 else {
1196 cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
1197 cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1198 }
1199
1200 inc_costs(IrRes, cost_Ir,
1201 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1202 inc_costs(DrRes, cost_Dr,
1203 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1204 }
1205}
1206
1207
1208VG_REGPARM(2)
1209static void log_0I1Dr(InstrInfo* ii, Addr data)
1210{
1211 CacheModelResult DrRes;
1212
1213 current_ii = ii;
1214 DrRes = (*simulator.D1_Read)(data, ii->data_size);
1215
barta0b6b2c2008-07-07 06:49:24 +00001216 CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001217 data, ii->data_size, DrRes);
1218
1219 if (CLG_(current_state).collect) {
1220 ULong *cost_Dr;
1221
1222 if (CLG_(current_state).nonskipped) {
1223 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1224 }
1225 else {
1226 cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1227 }
1228
1229 inc_costs(DrRes, cost_Dr,
1230 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1231 }
1232}
1233
1234
1235/* Instruction doing a write access */
1236
1237VG_REGPARM(2)
1238static void log_1I1Dw(InstrInfo* ii, Addr data)
1239{
1240 CacheModelResult IrRes, DwRes;
1241
1242 current_ii = ii;
1243 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1244 DwRes = (*simulator.D1_Write)(data, ii->data_size);
1245
barta0b6b2c2008-07-07 06:49:24 +00001246 CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001247 bb_base + ii->instr_offset, ii->instr_size,
1248 data, ii->data_size, IrRes, DwRes);
1249
1250 if (CLG_(current_state).collect) {
1251 ULong *cost_Ir, *cost_Dw;
1252
1253 if (CLG_(current_state).nonskipped) {
1254 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
1255 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1256 }
1257 else {
1258 cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
1259 cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
1260 }
1261
1262 inc_costs(IrRes, cost_Ir,
1263 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1264 inc_costs(DwRes, cost_Dw,
1265 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1266 }
1267}
1268
1269VG_REGPARM(2)
1270static void log_0I1Dw(InstrInfo* ii, Addr data)
1271{
1272 CacheModelResult DwRes;
1273
1274 current_ii = ii;
1275 DwRes = (*simulator.D1_Write)(data, ii->data_size);
1276
barta0b6b2c2008-07-07 06:49:24 +00001277 CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001278 data, ii->data_size, DwRes);
1279
1280 if (CLG_(current_state).collect) {
1281 ULong *cost_Dw;
1282
1283 if (CLG_(current_state).nonskipped) {
1284 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
1285 }
1286 else {
1287 cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
1288 }
1289
1290 inc_costs(DwRes, cost_Dw,
1291 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1292 }
1293}
1294
1295/* Instruction doing a read and a write access */
1296
1297VG_REGPARM(3)
1298static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
1299{
1300 CacheModelResult IrRes, DrRes, DwRes;
1301
1302 current_ii = ii;
1303 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1304 DrRes = (*simulator.D1_Read)(data1, ii->data_size);
1305 DwRes = (*simulator.D1_Write)(data2, ii->data_size);
1306
1307 CLG_DEBUG(6,
barta0b6b2c2008-07-07 06:49:24 +00001308 "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001309 bb_base + ii->instr_offset, ii->instr_size,
1310 data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);
1311
1312 if (CLG_(current_state).collect) {
1313 ULong *cost_Ir, *cost_Dr, *cost_Dw;
1314
1315 if (CLG_(current_state).nonskipped) {
1316 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
1317 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
1318 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1319 }
1320 else {
1321 cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
1322 cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
1323 cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
1324 }
1325
1326 inc_costs(IrRes, cost_Ir,
1327 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1328 inc_costs(DrRes, cost_Dr,
1329 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1330 inc_costs(DwRes, cost_Dw,
1331 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1332 }
1333}
1334
1335VG_REGPARM(3)
1336static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
1337{
1338 CacheModelResult DrRes, DwRes;
1339
1340 current_ii = ii;
1341 DrRes = (*simulator.D1_Read)(data1, ii->data_size);
1342 DwRes = (*simulator.D1_Write)(data2, ii->data_size);
1343
1344 CLG_DEBUG(6,
barta0b6b2c2008-07-07 06:49:24 +00001345 "log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001346 data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);
1347
1348 if (CLG_(current_state).collect) {
1349 ULong *cost_Dr, *cost_Dw;
1350
1351 if (CLG_(current_state).nonskipped) {
1352 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
1353 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1354 }
1355 else {
1356 cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
1357 cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
1358 }
1359
1360 inc_costs(DrRes, cost_Dr,
1361 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1362 inc_costs(DwRes, cost_Dw,
1363 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1364 }
1365}
1366
1367
1368/*------------------------------------------------------------*/
1369/*--- Cache configuration ---*/
1370/*------------------------------------------------------------*/
1371
1372#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })
1373
1374static cache_t clo_I1_cache = UNDEFINED_CACHE;
1375static cache_t clo_D1_cache = UNDEFINED_CACHE;
1376static cache_t clo_L2_cache = UNDEFINED_CACHE;
1377
1378
1379/* Checks cache config is ok; makes it so if not. */
1380static
1381void check_cache(cache_t* cache, Char *name)
1382{
1383 /* First check they're all powers of two */
1384 if (-1 == VG_(log2)(cache->size)) {
1385 VG_(message)(Vg_UserMsg,
1386 "error: %s size of %dB not a power of two; aborting.",
1387 name, cache->size);
1388 VG_(exit)(1);
1389 }
1390
1391 if (-1 == VG_(log2)(cache->assoc)) {
1392 VG_(message)(Vg_UserMsg,
1393 "error: %s associativity of %d not a power of two; aborting.",
1394 name, cache->assoc);
1395 VG_(exit)(1);
1396 }
1397
1398 if (-1 == VG_(log2)(cache->line_size)) {
1399 VG_(message)(Vg_UserMsg,
1400 "error: %s line size of %dB not a power of two; aborting.",
1401 name, cache->line_size);
1402 VG_(exit)(1);
1403 }
1404
1405 // Then check line size >= 16 -- any smaller and a single instruction could
1406 // straddle three cache lines, which breaks a simulation assertion and is
1407 // stupid anyway.
1408 if (cache->line_size < MIN_LINE_SIZE) {
1409 VG_(message)(Vg_UserMsg,
1410 "error: %s line size of %dB too small; aborting.",
1411 name, cache->line_size);
1412 VG_(exit)(1);
1413 }
1414
1415 /* Then check cache size > line size (causes seg faults if not). */
1416 if (cache->size <= cache->line_size) {
1417 VG_(message)(Vg_UserMsg,
1418 "error: %s cache size of %dB <= line size of %dB; aborting.",
1419 name, cache->size, cache->line_size);
1420 VG_(exit)(1);
1421 }
1422
1423 /* Then check assoc <= (size / line size) (seg faults otherwise). */
1424 if (cache->assoc > (cache->size / cache->line_size)) {
1425 VG_(message)(Vg_UserMsg,
1426 "warning: %s associativity > (size / line size); aborting.", name);
1427 VG_(exit)(1);
1428 }
1429}
1430
1431static
1432void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
1433{
1434#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
1435
1436 Int n_clos = 0;
1437
1438 // Count how many were defined on the command line.
1439 if (DEFINED(clo_I1_cache)) { n_clos++; }
1440 if (DEFINED(clo_D1_cache)) { n_clos++; }
1441 if (DEFINED(clo_L2_cache)) { n_clos++; }
1442
1443 // Set the cache config (using auto-detection, if supported by the
1444 // architecture)
1445 VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );
1446
1447 // Then replace with any defined on the command line.
1448 if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
1449 if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
1450 if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
1451
1452 // Then check values and fix if not acceptable.
1453 check_cache(I1c, "I1");
1454 check_cache(D1c, "D1");
1455 check_cache(L2c, "L2");
1456
1457 if (VG_(clo_verbosity) > 1) {
1458 VG_(message)(Vg_UserMsg, "Cache configuration used:");
1459 VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines",
1460 I1c->size, I1c->assoc, I1c->line_size);
1461 VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines",
1462 D1c->size, D1c->assoc, D1c->line_size);
1463 VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines",
1464 L2c->size, L2c->assoc, L2c->line_size);
1465 }
1466#undef CMD_LINE_DEFINED
1467}
1468
1469
1470/* Initialize and clear simulator state */
1471static void cachesim_post_clo_init(void)
1472{
1473 /* Cache configurations. */
1474 cache_t I1c, D1c, L2c;
1475
1476 /* Initialize access handlers */
1477 if (!CLG_(clo).simulate_cache) {
1478 CLG_(cachesim).log_1I0D = 0;
1479 CLG_(cachesim).log_1I0D_name = "(no function)";
1480
1481 CLG_(cachesim).log_1I1Dr = 0;
1482 CLG_(cachesim).log_1I1Dw = 0;
1483 CLG_(cachesim).log_1I2D = 0;
1484 CLG_(cachesim).log_1I1Dr_name = "(no function)";
1485 CLG_(cachesim).log_1I1Dw_name = "(no function)";
1486 CLG_(cachesim).log_1I2D_name = "(no function)";
1487
1488 CLG_(cachesim).log_0I1Dr = 0;
1489 CLG_(cachesim).log_0I1Dw = 0;
1490 CLG_(cachesim).log_0I2D = 0;
1491 CLG_(cachesim).log_0I1Dr_name = "(no function)";
1492 CLG_(cachesim).log_0I1Dw_name = "(no function)";
1493 CLG_(cachesim).log_0I2D_name = "(no function)";
1494 return;
1495 }
1496
1497 /* Configuration of caches only needed with real cache simulation */
1498 configure_caches(&I1c, &D1c, &L2c);
1499
1500 I1.name = "I1";
1501 D1.name = "D1";
1502 L2.name = "L2";
1503
1504 cachesim_initcache(I1c, &I1);
1505 cachesim_initcache(D1c, &D1);
1506 cachesim_initcache(L2c, &L2);
1507
1508 /* the other cache simulators use the standard helpers
1509 * with dispatching via simulator struct */
1510
1511 CLG_(cachesim).log_1I0D = log_1I0D;
1512 CLG_(cachesim).log_1I0D_name = "log_1I0D";
1513
1514 CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1515 CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1516 CLG_(cachesim).log_1I2D = log_1I2D;
1517 CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1518 CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1519 CLG_(cachesim).log_1I2D_name = "log_1I2D";
1520
1521 CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1522 CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1523 CLG_(cachesim).log_0I2D = log_0I2D;
1524 CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1525 CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1526 CLG_(cachesim).log_0I2D_name = "log_0I2D";
1527
1528 if (clo_collect_cacheuse) {
1529
1530 /* Output warning for not supported option combinations */
1531 if (clo_simulate_hwpref) {
1532 VG_(message)(Vg_DebugMsg,
1533 "warning: prefetch simulation can not be used with cache usage");
1534 clo_simulate_hwpref = False;
1535 }
1536
1537 if (clo_simulate_writeback) {
1538 VG_(message)(Vg_DebugMsg,
1539 "warning: write-back simulation can not be used with cache usage");
1540 clo_simulate_writeback = False;
1541 }
1542
1543 simulator.I1_Read = cacheuse_I1_doRead;
1544 simulator.D1_Read = cacheuse_D1_doRead;
1545 simulator.D1_Write = cacheuse_D1_doRead;
1546 return;
1547 }
1548
1549 if (clo_simulate_hwpref) {
1550 prefetch_clear();
1551
1552 if (clo_simulate_writeback) {
1553 simulator.I1_Read = prefetch_I1_Read;
1554 simulator.D1_Read = prefetch_D1_Read;
1555 simulator.D1_Write = prefetch_D1_Write;
1556 }
1557 else {
1558 simulator.I1_Read = prefetch_I1_ref;
1559 simulator.D1_Read = prefetch_D1_ref;
1560 simulator.D1_Write = prefetch_D1_ref;
1561 }
1562
1563 return;
1564 }
1565
1566 if (clo_simulate_writeback) {
1567 simulator.I1_Read = cachesim_I1_Read;
1568 simulator.D1_Read = cachesim_D1_Read;
1569 simulator.D1_Write = cachesim_D1_Write;
1570 }
1571 else {
1572 simulator.I1_Read = cachesim_I1_ref;
1573 simulator.D1_Read = cachesim_D1_ref;
1574 simulator.D1_Write = cachesim_D1_ref;
1575 }
1576}
1577
1578
1579/* Clear simulator state. Has to be initialized before */
1580static
1581void cachesim_clear(void)
1582{
1583 cachesim_clearcache(&I1);
1584 cachesim_clearcache(&D1);
1585 cachesim_clearcache(&L2);
1586
1587 prefetch_clear();
1588}
1589
1590
1591static void cachesim_getdesc(Char* buf)
1592{
1593 Int p;
1594 p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1595 p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1596 VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
1597}
1598
1599static
1600void cachesim_print_opts(void)
1601{
1602 VG_(printf)(
1603"\n cache simulator options:\n"
1604" --simulate-cache=no|yes Do cache simulation [no]\n"
1605" --simulate-wb=no|yes Count write-back events [no]\n"
1606" --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
1607#if CLG_EXPERIMENTAL
1608" --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1609#endif
1610" --cacheuse=no|yes Collect cache block use [no]\n"
1611" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
1612" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
1613" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
1614 );
1615}
1616
1617static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
1618{
1619 int i1, i2, i3;
1620 int i;
sewardj9c606bd2008-09-18 18:12:50 +00001621 char *opt = VG_(strdup)("cl.sim.po.1", orig_opt);
weidendoa17f2a32006-03-20 10:27:30 +00001622
1623 i = i1 = opt_len;
1624
1625 /* Option looks like "--I1=65536,2,64".
1626 * Find commas, replace with NULs to make three independent
1627 * strings, then extract numbers. Yuck. */
1628 while (VG_(isdigit)(opt[i])) i++;
1629 if (',' == opt[i]) {
1630 opt[i++] = '\0';
1631 i2 = i;
1632 } else goto bad;
1633 while (VG_(isdigit)(opt[i])) i++;
1634 if (',' == opt[i]) {
1635 opt[i++] = '\0';
1636 i3 = i;
1637 } else goto bad;
1638 while (VG_(isdigit)(opt[i])) i++;
1639 if ('\0' != opt[i]) goto bad;
1640
1641 cache->size = (Int)VG_(atoll)(opt + i1);
1642 cache->assoc = (Int)VG_(atoll)(opt + i2);
1643 cache->line_size = (Int)VG_(atoll)(opt + i3);
1644
1645 VG_(free)(opt);
1646
1647 return;
1648
1649 bad:
sewardj6893d652006-10-15 01:25:13 +00001650 VG_(err_bad_option)(orig_opt);
weidendoa17f2a32006-03-20 10:27:30 +00001651}
1652
1653/* Check for command line option for cache configuration.
1654 * Return False if unknown and not handled.
1655 *
1656 * Called from CLG_(process_cmd_line_option)() in clo.c
1657 */
1658static Bool cachesim_parse_opt(Char* arg)
1659{
1660 if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
1661 clo_simulate_writeback = True;
1662 else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
1663 clo_simulate_writeback = False;
1664
1665 else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
1666 clo_simulate_hwpref = True;
1667 else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
1668 clo_simulate_hwpref = False;
1669
1670 else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
1671 clo_simulate_sectors = True;
1672 else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
1673 clo_simulate_sectors = False;
1674
1675 else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
1676 clo_collect_cacheuse = True;
1677 /* Use counters only make sense with fine dumping */
1678 CLG_(clo).dump_instr = True;
1679 }
1680 else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
1681 clo_collect_cacheuse = False;
1682
1683 /* 5 is length of "--I1=" */
1684 else if (0 == VG_(strncmp)(arg, "--I1=", 5))
1685 parse_opt(&clo_I1_cache, arg, 5);
1686 else if (0 == VG_(strncmp)(arg, "--D1=", 5))
1687 parse_opt(&clo_D1_cache, arg, 5);
1688 else if (0 == VG_(strncmp)(arg, "--L2=", 5))
1689 parse_opt(&clo_L2_cache, arg, 5);
1690 else
1691 return False;
1692
1693 return True;
1694}
1695
1696/* Adds commas to ULong, right justifying in a field field_width wide, returns
1697 * the string in buf. */
1698static
1699Int commify(ULong n, int field_width, char* buf)
1700{
1701 int len, n_commas, i, j, new_len, space;
1702
1703 VG_(sprintf)(buf, "%llu", n);
1704 len = VG_(strlen)(buf);
1705 n_commas = (len - 1) / 3;
1706 new_len = len + n_commas;
1707 space = field_width - new_len;
1708
1709 /* Allow for printing a number in a field_width smaller than it's size */
1710 if (space < 0) space = 0;
1711
1712 /* Make j = -1 because we copy the '\0' before doing the numbers in groups
1713 * of three. */
1714 for (j = -1, i = len ; i >= 0; i--) {
1715 buf[i + n_commas + space] = buf[i];
1716
1717 if ((i>0) && (3 == ++j)) {
1718 j = 0;
1719 n_commas--;
1720 buf[i + n_commas + space] = ',';
1721 }
1722 }
1723 /* Right justify in field. */
1724 for (i = 0; i < space; i++) buf[i] = ' ';
1725 return new_len;
1726}
1727
1728static
1729void percentify(Int n, Int ex, Int field_width, char buf[])
1730{
1731 int i, len, space;
1732
1733 VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
1734 len = VG_(strlen)(buf);
1735 space = field_width - len;
1736 if (space < 0) space = 0; /* Allow for v. small field_width */
1737 i = len;
1738
1739 /* Right justify in field */
1740 for ( ; i >= 0; i--) buf[i + space] = buf[i];
1741 for (i = 0; i < space; i++) buf[i] = ' ';
1742}
1743
1744static
1745void cachesim_printstat(void)
1746{
1747 FullCost total = CLG_(total_cost), D_total = 0;
1748 ULong L2_total_m, L2_total_mr, L2_total_mw,
1749 L2_total, L2_total_r, L2_total_w;
1750 char buf1[RESULTS_BUF_LEN],
1751 buf2[RESULTS_BUF_LEN],
1752 buf3[RESULTS_BUF_LEN];
1753 Int l1, l2, l3;
1754 Int p;
1755
1756 if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1757 VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
1758 prefetch_up);
1759 VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
1760 prefetch_down);
1761 VG_(message)(Vg_DebugMsg, "");
1762 }
1763
1764 /* I cache results. Use the I_refs value to determine the first column
1765 * width. */
1766 l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
1767 VG_(message)(Vg_UserMsg, "I refs: %s", buf1);
1768
1769 if (!CLG_(clo).simulate_cache) return;
1770
1771 commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
1772 VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);
1773
1774 commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
1775 VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);
1776
1777 p = 100;
1778
1779 if (0 == total[CLG_(sets).off_full_Ir])
1780 total[CLG_(sets).off_full_Ir] = 1;
1781
1782 percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
1783 total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1784 VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);
1785
1786 percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
1787 total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1788 VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
1789 VG_(message)(Vg_UserMsg, "");
1790
1791 /* D cache results.
1792 Use the D_refs.rd and D_refs.wr values to determine the
1793 * width of columns 2 & 3. */
1794
1795 D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1796 CLG_(init_cost)( CLG_(sets).full, D_total);
1797 CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
1798 CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );
1799
1800 commify( D_total[0], l1, buf1);
1801 l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
1802 l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
1803 VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
1804 buf1, buf2, buf3);
1805
1806 commify( D_total[1], l1, buf1);
1807 commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
1808 commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
1809 VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
1810 buf1, buf2, buf3);
1811
1812 commify( D_total[2], l1, buf1);
1813 commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
1814 commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
1815 VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
1816 buf1, buf2, buf3);
1817
1818 p = 10;
1819
1820 if (0 == D_total[0]) D_total[0] = 1;
1821 if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
1822 if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;
1823
1824 percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
1825 percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
1826 total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1827 percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
1828 total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1829 VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);
1830
1831 percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
1832 percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
1833 total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1834 percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
1835 total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1836 VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
1837 VG_(message)(Vg_UserMsg, "");
1838
1839
1840
1841 /* L2 overall results */
1842
1843 L2_total =
1844 total[CLG_(sets).off_full_Dr +1] +
1845 total[CLG_(sets).off_full_Dw +1] +
1846 total[CLG_(sets).off_full_Ir +1];
1847 L2_total_r =
1848 total[CLG_(sets).off_full_Dr +1] +
1849 total[CLG_(sets).off_full_Ir +1];
1850 L2_total_w = total[CLG_(sets).off_full_Dw +1];
1851 commify(L2_total, l1, buf1);
1852 commify(L2_total_r, l2, buf2);
1853 commify(L2_total_w, l3, buf3);
1854 VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
1855 buf1, buf2, buf3);
1856
1857 L2_total_m =
1858 total[CLG_(sets).off_full_Dr +2] +
1859 total[CLG_(sets).off_full_Dw +2] +
1860 total[CLG_(sets).off_full_Ir +2];
1861 L2_total_mr =
1862 total[CLG_(sets).off_full_Dr +2] +
1863 total[CLG_(sets).off_full_Ir +2];
1864 L2_total_mw = total[CLG_(sets).off_full_Dw +2];
1865 commify(L2_total_m, l1, buf1);
1866 commify(L2_total_mr, l2, buf2);
1867 commify(L2_total_mw, l3, buf3);
1868 VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
1869 buf1, buf2, buf3);
1870
1871 percentify(L2_total_m * 100 * p /
1872 (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
1873 percentify(L2_total_mr * 100 * p /
1874 (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
1875 p, l2+1, buf2);
1876 percentify(L2_total_mw * 100 * p /
1877 total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1878 VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
1879 buf1, buf2,buf3);
1880}
1881
1882
1883/*------------------------------------------------------------*/
1884/*--- Setup for Event set. ---*/
1885/*------------------------------------------------------------*/
1886
/* Global event set bookkeeping: the event sets themselves and the offsets
 * of the Ir/Dr/Dw groups inside the "sim" and "full" sets.  It is filled
 * in by CLG_(init_eventsets)(). */
struct event_sets CLG_(sets);
1888
1889void CLG_(init_eventsets)(Int max_user)
1890{
1891 EventType * e1, *e2, *e3, *e4;
1892 EventSet *Ir, *Dr, *Dw;
1893 EventSet *D0, *D1r, *D1w, *D2;
1894 EventSet *sim, *full;
1895 EventSet *use;
1896 int sizeOfUseIr;
1897
1898 use = CLG_(get_eventset)("Use", 4);
1899 if (clo_collect_cacheuse) {
1900 /* if TUse is 0, there was never a load, and no loss, too */
1901 e1 = CLG_(register_eventtype)("AcCost1");
1902 CLG_(add_eventtype)(use, e1);
1903 e1 = CLG_(register_eventtype)("SpLoss1");
1904 CLG_(add_eventtype)(use, e1);
1905 e1 = CLG_(register_eventtype)("AcCost2");
1906 CLG_(add_eventtype)(use, e1);
1907 e1 = CLG_(register_eventtype)("SpLoss2");
1908 CLG_(add_eventtype)(use, e1);
1909 }
1910
1911 Ir = CLG_(get_eventset)("Ir", 4);
1912 Dr = CLG_(get_eventset)("Dr", 4);
1913 Dw = CLG_(get_eventset)("Dw", 4);
1914 if (CLG_(clo).simulate_cache) {
1915 e1 = CLG_(register_eventtype)("Ir");
1916 e2 = CLG_(register_eventtype)("I1mr");
1917 e3 = CLG_(register_eventtype)("I2mr");
1918 if (clo_simulate_writeback) {
1919 e4 = CLG_(register_eventtype)("I2dmr");
1920 CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
1921 }
1922 else
1923 CLG_(add_dep_event3)(Ir, e1,e2,e3);
1924
1925 e1 = CLG_(register_eventtype)("Dr");
1926 e2 = CLG_(register_eventtype)("D1mr");
1927 e3 = CLG_(register_eventtype)("D2mr");
1928 if (clo_simulate_writeback) {
1929 e4 = CLG_(register_eventtype)("D2dmr");
1930 CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
1931 }
1932 else
1933 CLG_(add_dep_event3)(Dr, e1,e2,e3);
1934
1935 e1 = CLG_(register_eventtype)("Dw");
1936 e2 = CLG_(register_eventtype)("D1mw");
1937 e3 = CLG_(register_eventtype)("D2mw");
1938 if (clo_simulate_writeback) {
1939 e4 = CLG_(register_eventtype)("D2dmw");
1940 CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
1941 }
1942 else
1943 CLG_(add_dep_event3)(Dw, e1,e2,e3);
1944
1945 }
1946 else {
1947 e1 = CLG_(register_eventtype)("Ir");
1948 CLG_(add_eventtype)(Ir, e1);
1949 }
1950
1951 sizeOfUseIr = use->size + Ir->size;
1952 D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
1953 CLG_(add_eventset)(D0, use);
1954 off_D0_Ir = CLG_(add_eventset)(D0, Ir);
1955
1956 D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
1957 CLG_(add_eventset)(D1r, use);
1958 off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
1959 off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
1960
1961 D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
1962 CLG_(add_eventset)(D1w, use);
1963 off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
1964 off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);
1965
1966 D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
1967 CLG_(add_eventset)(D2, use);
1968 off_D2_Ir = CLG_(add_eventset)(D2, Ir);
1969 off_D2_Dr = CLG_(add_eventset)(D2, Dr);
1970 off_D2_Dw = CLG_(add_eventset)(D2, Dw);
1971
1972 sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
1973 CLG_(add_eventset)(sim, use);
1974 CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
1975 CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
1976 CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);
1977
1978 if (CLG_(clo).collect_alloc) max_user += 2;
1979 if (CLG_(clo).collect_systime) max_user += 2;
1980
1981 full = CLG_(get_eventset)("full", sim->size + max_user);
1982 CLG_(add_eventset)(full, sim);
1983 CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
1984 CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
1985 CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;
1986
1987 CLG_(sets).use = use;
1988 CLG_(sets).Ir = Ir;
1989 CLG_(sets).Dr = Dr;
1990 CLG_(sets).Dw = Dw;
1991
1992 CLG_(sets).D0 = D0;
1993 CLG_(sets).D1r = D1r;
1994 CLG_(sets).D1w = D1w;
1995 CLG_(sets).D2 = D2;
1996
1997 CLG_(sets).sim = sim;
1998 CLG_(sets).full = full;
1999
2000 if (CLG_(clo).collect_alloc) {
2001 e1 = CLG_(register_eventtype)("allocCount");
2002 e2 = CLG_(register_eventtype)("allocSize");
2003 CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
2004 }
2005
2006 if (CLG_(clo).collect_systime) {
2007 e1 = CLG_(register_eventtype)("sysCount");
2008 e2 = CLG_(register_eventtype)("sysTime");
2009 CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
2010 }
2011
2012 CLG_DEBUGIF(1) {
2013 CLG_DEBUG(1, "EventSets:\n");
2014 CLG_(print_eventset)(-2, use);
2015 CLG_(print_eventset)(-2, Ir);
2016 CLG_(print_eventset)(-2, Dr);
2017 CLG_(print_eventset)(-2, Dw);
2018 CLG_(print_eventset)(-2, sim);
2019 CLG_(print_eventset)(-2, full);
2020 }
2021
2022 /* Not-existing events are silently ignored */
2023 CLG_(dumpmap) = CLG_(get_eventmapping)(full);
2024 CLG_(append_event)(CLG_(dumpmap), "Ir");
2025 CLG_(append_event)(CLG_(dumpmap), "Dr");
2026 CLG_(append_event)(CLG_(dumpmap), "Dw");
2027 CLG_(append_event)(CLG_(dumpmap), "I1mr");
2028 CLG_(append_event)(CLG_(dumpmap), "D1mr");
2029 CLG_(append_event)(CLG_(dumpmap), "D1mw");
2030 CLG_(append_event)(CLG_(dumpmap), "I2mr");
2031 CLG_(append_event)(CLG_(dumpmap), "D2mr");
2032 CLG_(append_event)(CLG_(dumpmap), "D2mw");
2033 CLG_(append_event)(CLG_(dumpmap), "I2dmr");
2034 CLG_(append_event)(CLG_(dumpmap), "D2dmr");
2035 CLG_(append_event)(CLG_(dumpmap), "D2dmw");
2036 CLG_(append_event)(CLG_(dumpmap), "AcCost1");
2037 CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
2038 CLG_(append_event)(CLG_(dumpmap), "AcCost2");
2039 CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
2040 CLG_(append_event)(CLG_(dumpmap), "allocCount");
2041 CLG_(append_event)(CLG_(dumpmap), "allocSize");
2042 CLG_(append_event)(CLG_(dumpmap), "sysCount");
2043 CLG_(append_event)(CLG_(dumpmap), "sysTime");
2044
2045}
2046
2047
2048
2049static
2050void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
2051{
2052 /* if eventset use is defined, it is always first (hardcoded!) */
2053 CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);
2054
2055 /* FIXME: This is hardcoded... */
2056 if (es == CLG_(sets).D0) {
2057 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2058 cost + off_D0_Ir);
2059 }
2060 else if (es == CLG_(sets).D1r) {
2061 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2062 cost + off_D1r_Ir);
2063 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2064 cost + off_D1r_Dr);
2065 }
2066 else if (es == CLG_(sets).D1w) {
2067 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2068 cost + off_D1w_Ir);
2069 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2070 cost + off_D1w_Dw);
2071 }
2072 else {
2073 CLG_ASSERT(es == CLG_(sets).D2);
2074 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
2075 cost + off_D2_Ir);
2076 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
2077 cost + off_D2_Dr);
2078 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
2079 cost + off_D2_Dw);
2080 }
2081}
2082
2083/* this is called at dump time for every instruction executed */
2084static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
2085 InstrInfo* ii, ULong exe_count)
2086{
2087 if (!CLG_(clo).simulate_cache)
2088 cost[CLG_(sets).off_sim_Ir] += exe_count;
2089 else {
2090
2091#if 0
2092/* There is always a trivial case where exe_count and Ir can be
2093 * slightly different because ecounter is updated when executing
2094 * the next BB. E.g. for last BB executed, or when toggling collection
2095 */
2096 /* FIXME: Hardcoded that each eventset has Ir as first */
2097 if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
2098 VG_(printf)("==> Ir %llu, exe %llu\n",
2099 (bbcc->cost + ii->cost_offset)[0], exe_count);
2100 CLG_(print_bbcc_cost)(-2, bbcc);
2101 //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
2102 }
2103#endif
2104
2105 add_and_zero_Dx(ii->eventset, cost,
2106 bbcc->cost + ii->cost_offset);
2107 }
2108}
2109
2110static
2111void cachesim_after_bbsetup(void)
2112{
2113 BBCC* bbcc = CLG_(current_state).bbcc;
2114
2115 if (CLG_(clo).simulate_cache) {
2116 BB* bb = bbcc->bb;
2117
2118 /* only needed if log_* functions are called */
2119 bb_base = bb->obj->offset + bb->offset;
2120 cost_base = bbcc->cost;
2121 }
2122}
2123
2124static
2125void cachesim_finish(void)
2126{
2127 if (clo_collect_cacheuse)
2128 cacheuse_finish();
2129}
2130
2131/*------------------------------------------------------------*/
2132/*--- The simulator defined in this file ---*/
2133/*------------------------------------------------------------*/
2134
2135struct cachesim_if CLG_(cachesim) = {
2136 .print_opts = cachesim_print_opts,
2137 .parse_opt = cachesim_parse_opt,
2138 .post_clo_init = cachesim_post_clo_init,
2139 .clear = cachesim_clear,
2140 .getdesc = cachesim_getdesc,
2141 .printstat = cachesim_printstat,
2142 .add_icost = cachesim_add_icost,
2143 .after_bbsetup = cachesim_after_bbsetup,
2144 .finish = cachesim_finish,
2145
2146 /* these will be set by cachesim_post_clo_init */
2147 .log_1I0D = 0,
2148
2149 .log_1I1Dr = 0,
2150 .log_1I1Dw = 0,
2151 .log_1I2D = 0,
2152
2153 .log_0I1Dr = 0,
2154 .log_0I1Dw = 0,
2155 .log_0I2D = 0,
2156
2157 .log_1I0D_name = "(no function)",
2158
2159 .log_1I1Dr_name = "(no function)",
2160 .log_1I1Dw_name = "(no function)",
2161 .log_1I2D_name = "(no function)",
2162
2163 .log_0I1Dr_name = "(no function)",
2164 .log_0I1Dw_name = "(no function)",
2165 .log_0I2D_name = "(no function)"
2166};
2167
2168
2169/*--------------------------------------------------------------------*/
2170 /*--- end sim.c ---*/
2171/*--------------------------------------------------------------------*/
2172