blob: 5208a725ee10ae0cc16d9e8925f664e9714cd254 [file] [log] [blame]
weidendoa17f2a32006-03-20 10:27:30 +00001
2/*--------------------------------------------------------------------*/
3/*--- Cache simulation. ---*/
4/*--- sim.c ---*/
5/*--------------------------------------------------------------------*/
6
7/*
njn9a0cba42007-04-15 22:15:57 +00008 This file is part of Callgrind, a Valgrind tool for call graph
9 profiling programs.
weidendoa17f2a32006-03-20 10:27:30 +000010
njn9a0cba42007-04-15 22:15:57 +000011 Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
weidendoa17f2a32006-03-20 10:27:30 +000012
njn9a0cba42007-04-15 22:15:57 +000013 This tool is derived from and contains code from Cachegrind
sewardj4d474d02008-02-11 11:34:59 +000014 Copyright (C) 2002-2008 Nicholas Nethercote (njn@valgrind.org)
weidendoa17f2a32006-03-20 10:27:30 +000015
16 This program is free software; you can redistribute it and/or
17 modify it under the terms of the GNU General Public License as
18 published by the Free Software Foundation; either version 2 of the
19 License, or (at your option) any later version.
20
21 This program is distributed in the hope that it will be useful, but
22 WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with this program; if not, write to the Free Software
28 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29 02111-1307, USA.
30
31 The GNU General Public License is contained in the file COPYING.
32*/
33
34#include "global.h"
35
36
37/* Notes:
38 - simulates a write-allocate cache
39 - (block --> set) hash function uses simple bit selection
40 - handling of references straddling two cache blocks:
41 - counts as only one cache access (not two)
42 - both blocks hit --> one hit
43 - one block hits, the other misses --> one miss
44 - both blocks miss --> one miss (not two)
45*/
46
47/* Cache configuration */
48#include "cg_arch.h"
49
50/* additional structures for cache use info, separated
51 * according usage frequency:
52 * - line_loaded : pointer to cost center of instruction
53 * which loaded the line into cache.
54 * Needed to increment counters when line is evicted.
55 * - line_use : updated on every access
56 */
/* Per-cacheline usage info, updated on every access while the line
 * is resident. */
typedef struct {
  UInt count;  /* number of accesses since the line was loaded */
  UInt mask;   /* bitmap of touched parts; e.g. for 64Byte line size 1bit/2Byte */
} line_use;
61
/* Info recorded when a line is loaded; needed to attribute use/loss
 * counters to the loading instruction when the line is evicted. */
typedef struct {
  Addr memline, iaddr;   /* memory line address and loading instruction address */
  line_use* dep_use;     /* point to higher-level cacheblock for this memline */
  ULong* use_base;       /* cost-center array the use counters are added to */
} line_loaded;
67
/* Cache state */
typedef struct {
   char*        name;
   int          size;        /* bytes */
   int          assoc;       /* ways per set */
   int          line_size;   /* bytes */
   Bool         sectored;    /* prefetch nearside cacheline on read */
   int          sets;
   int          sets_min_1;      /* sets-1, used as index mask */
   int          line_size_bits;  /* log2(line_size) */
   int          tag_shift;       /* address bits below the tag */
   UWord        tag_mask;        /* mask selecting the tag bits of an address */
   char         desc_line[128];  /* human-readable config description */
   UWord*       tags;            /* sets*assoc tag entries, MRU-first per set */

   /* for cache use (only allocated when --cacheuse is active) */
   int          line_size_mask;   /* line_size-1 */
   int*         line_start_mask;  /* per byte offset: bits touched from offset on */
   int*         line_end_mask;    /* per byte offset: bits touched up to offset */
   line_loaded* loaded;
   line_use*    use;
} cache_t2;
90
/*
 * States of flat caches in our model.
 * We use a 2-level hierarchy,
 */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator Options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* Following global vars are setup before by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */

/* Offset to events in event set, used in log_* functions */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

static Addr   bb_base;        /* instruction start address of current BB */
static ULong* cost_base;      /* start of cost array for current BB */
static InstrInfo* current_ii; /* instruction currently being simulated */

/* Cache use offsets */
/* FIXME: The offsets are only correct because all eventsets get
 * the "Use" set added first !
 */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_L2_AcCost = 2;
static Int off_L2_SpLoss = 3;

/* Cache access types; Write doubles as the dirty flag OR'ed into tags */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    L2_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

/* Dispatch table: set once at startup to the model variant selected
 * by the clo_* options above */
static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;
161
162/*------------------------------------------------------------*/
163/*--- Cache Simulator Initialization ---*/
164/*------------------------------------------------------------*/
165
166static void cachesim_clearcache(cache_t2* c)
167{
168 Int i;
169
170 for (i = 0; i < c->sets * c->assoc; i++)
171 c->tags[i] = 0;
172 if (c->use) {
173 for (i = 0; i < c->sets * c->assoc; i++) {
174 c->loaded[i].memline = 0;
175 c->loaded[i].use_base = 0;
176 c->loaded[i].dep_use = 0;
177 c->loaded[i].iaddr = 0;
178 c->use[i].mask = 0;
179 c->use[i].count = 0;
180 c->tags[i] = i % c->assoc; /* init lower bits as pointer */
181 }
182 }
183}
184
185static void cacheuse_initcache(cache_t2* c);
186
187/* By this point, the size/assoc/line_size has been checked. */
188static void cachesim_initcache(cache_t config, cache_t2* c)
189{
190 c->size = config.size;
191 c->assoc = config.assoc;
192 c->line_size = config.line_size;
193 c->sectored = False; // FIXME
194
195 c->sets = (c->size / c->line_size) / c->assoc;
196 c->sets_min_1 = c->sets - 1;
weidendoa17f2a32006-03-20 10:27:30 +0000197 c->line_size_bits = VG_(log2)(c->line_size);
198 c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
199 c->tag_mask = ~((1<<c->tag_shift)-1);
200
201 /* Can bits in tag entries be used for flags?
202 * Should be always true as MIN_LINE_SIZE >= 16 */
203 CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
204
205 if (c->assoc == 1) {
206 VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
207 c->size, c->line_size,
208 c->sectored ? ", sectored":"");
209 } else {
210 VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
211 c->size, c->line_size, c->assoc,
212 c->sectored ? ", sectored":"");
213 }
214
sewardj9c606bd2008-09-18 18:12:50 +0000215 c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
216 sizeof(UWord) * c->sets * c->assoc);
weidendoa17f2a32006-03-20 10:27:30 +0000217 if (clo_collect_cacheuse)
218 cacheuse_initcache(c);
219 else
220 c->use = 0;
221 cachesim_clearcache(c);
222}
223
224
#if 0
/* Debug helper (disabled): dump all tag values of a cache,
 * one set per output row. */
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif
239
240
241/*------------------------------------------------------------*/
242/*--- Write Through Cache Simulation ---*/
243/*------------------------------------------------------------*/
244
245/*
246 * Simple model: L1 & L2 Write Through
247 * Does not distinguish among read and write references
248 *
249 * Simulator functions:
250 * CacheModelResult cachesim_I1_ref(Addr a, UChar size)
251 * CacheModelResult cachesim_D1_ref(Addr a, UChar size)
252 */
253
254static __inline__
255CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
256{
257 int i, j;
258 UWord *set;
259
weidendo144b76c2009-01-26 22:56:14 +0000260 set = &(c->tags[set_no * c->assoc]);
weidendoa17f2a32006-03-20 10:27:30 +0000261
262 /* This loop is unrolled for just the first case, which is the most */
263 /* common. We can't unroll any further because it would screw up */
264 /* if we have a direct-mapped (1-way) cache. */
265 if (tag == set[0])
266 return Hit;
267
268 /* If the tag is one other than the MRU, move it into the MRU spot */
269 /* and shuffle the rest down. */
270 for (i = 1; i < c->assoc; i++) {
271 if (tag == set[i]) {
272 for (j = i; j > 0; j--) {
273 set[j] = set[j - 1];
274 }
275 set[0] = tag;
276 return Hit;
277 }
278 }
279
280 /* A miss; install this tag as MRU, shuffle rest down. */
281 for (j = c->assoc - 1; j > 0; j--) {
282 set[j] = set[j - 1];
283 }
284 set[0] = tag;
285
286 return Miss;
287}
288
289static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
290{
291 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
292 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
293 UWord tag = a >> c->tag_shift;
294
295 /* Access entirely within line. */
296 if (set1 == set2)
297 return cachesim_setref(c, set1, tag);
298
299 /* Access straddles two lines. */
300 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
301 else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo28e2a142006-11-22 21:00:53 +0000302 UWord tag2 = (a+size-1) >> c->tag_shift;
weidendoa17f2a32006-03-20 10:27:30 +0000303
304 /* the call updates cache structures as side effect */
305 CacheResult res1 = cachesim_setref(c, set1, tag);
weidendo28e2a142006-11-22 21:00:53 +0000306 CacheResult res2 = cachesim_setref(c, set2, tag2);
weidendoa17f2a32006-03-20 10:27:30 +0000307 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
308
309 } else {
njn8a7b41b2007-09-23 00:51:24 +0000310 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendoa17f2a32006-03-20 10:27:30 +0000311 VG_(tool_panic)("item straddles more than two cache sets");
312 }
313 return Hit;
314}
315
316static
317CacheModelResult cachesim_I1_ref(Addr a, UChar size)
318{
319 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
320 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
321 return MemAccess;
322}
323
324static
325CacheModelResult cachesim_D1_ref(Addr a, UChar size)
326{
327 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
328 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
329 return MemAccess;
330}
331
332
333/*------------------------------------------------------------*/
334/*--- Write Back Cache Simulation ---*/
335/*------------------------------------------------------------*/
336
337/*
338 * More complex model: L1 Write-through, L2 Write-back
339 * This needs to distinguish among read and write references.
340 *
341 * Simulator functions:
342 * CacheModelResult cachesim_I1_Read(Addr a, UChar size)
343 * CacheModelResult cachesim_D1_Read(Addr a, UChar size)
344 * CacheModelResult cachesim_D1_Write(Addr a, UChar size)
345 */
346
347/*
348 * With write-back, result can be a miss evicting a dirty line
349 * The dirty state of a cache line is stored in Bit0 of the tag for
350 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
351 * type (Read/Write), the line gets dirty on a write.
352 */
/* Set lookup for the write-back model.  The dirty state of a line is
 * kept in bit 0 of its tag (CACHELINE_DIRTY); OR'ing in the RefType
 * marks the line dirty on a write.  Returns MissDirty when the evicted
 * line was dirty (i.e. a write-back to memory is needed). */
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
        set[0] |= ref;  /* a Write sets the dirty bit; a Read is a no-op */
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot */
    /* and shuffle the rest down. */
    for (i = 1; i < c->assoc; i++) {
        if (tag == (set[i] & ~CACHELINE_DIRTY)) {
            tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss; install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];  /* remember evicted entry for its dirty bit */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}
390
391
/* Simulate one reference into a write-back cache.  Note: unlike
 * cachesim_ref(), the tag here is the masked address (a & tag_mask),
 * not the shifted one, so the low bits stay free for the dirty flag.
 * A reference spanning two lines counts as one access; a dirty
 * eviction in either line dominates the result. */
static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
        return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
        UWord tag2 = (a+size-1) & c->tag_mask;

        /* the call updates cache structures as side effect */
        CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
        CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

        if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

    } else {
        VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
        VG_(tool_panic)("item straddles more than two cache sets");
    }
    return Hit;  /* not reached */
}
421
422
423static
424CacheModelResult cachesim_I1_Read(Addr a, UChar size)
425{
426 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
427 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
428 case Hit: return L2_Hit;
429 case Miss: return MemAccess;
430 default: break;
431 }
432 return WriteBackMemAccess;
433}
434
435static
436CacheModelResult cachesim_D1_Read(Addr a, UChar size)
437{
438 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
439 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
440 case Hit: return L2_Hit;
441 case Miss: return MemAccess;
442 default: break;
443 }
444 return WriteBackMemAccess;
445}
446
447static
448CacheModelResult cachesim_D1_Write(Addr a, UChar size)
449{
450 if ( cachesim_ref( &D1, a, size) == Hit ) {
451 /* Even for a L1 hit, the write-trough L1 passes
452 * the write to the L2 to make the L2 line dirty.
453 * But this causes no latency, so return the hit.
454 */
455 cachesim_ref_wb( &L2, Write, a, size);
456 return L1_Hit;
457 }
458 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
459 case Hit: return L2_Hit;
460 case Miss: return MemAccess;
461 default: break;
462 }
463 return WriteBackMemAccess;
464}
465
466
467/*------------------------------------------------------------*/
468/*--- Hardware Prefetch Simulation ---*/
469/*------------------------------------------------------------*/
470
/* counters for prefetches issued by the HW-prefetch emulation */
static ULong prefetch_up = 0;    /* prefetches in ascending direction */
static ULong prefetch_down = 0;  /* prefetches in descending direction */

#define PF_STREAMS  8    /* number of concurrently tracked streams */
#define PF_PAGEBITS 12   /* one stream detected per 4k page */

static UInt pf_lastblock[PF_STREAMS];  /* last L2 block seen per stream */
static Int pf_seqblocks[PF_STREAMS];   /* signed sequential-run length per stream */
479
480static
481void prefetch_clear(void)
482{
483 int i;
484 for(i=0;i<PF_STREAMS;i++)
485 pf_lastblock[i] = pf_seqblocks[i] = 0;
486}
487
488/*
489 * HW Prefetch emulation
490 * Start prefetching when detecting sequential access to 3 memory blocks.
491 * One stream can be detected per 4k page.
492 */
/* HW prefetch emulation step for one access at address 'a'.
 * pf_seqblocks holds a signed run length: positive for ascending
 * block accesses, negative for descending.  Once a run reaches
 * length 2 (i.e. 3 sequential blocks seen), each further sequential
 * access prefetches the line 5 blocks ahead (or behind) into L2. */
static __inline__
void prefetch_L2_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;  /* one stream per 4k page */
  UInt block = ( a >> L2.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      /* no direction yet: start an up- or down-run if adjacent */
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
	pf_seqblocks[stream]++;
	if (pf_seqblocks[stream] >= 2) {
	  prefetch_up++;
	  /* prefetch 5 lines ahead; updates L2 LRU state as side effect */
	  cachesim_ref(&L2, a + 5 * L2.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;  /* run broken: restart detection */
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
	pf_seqblocks[stream]--;
	if (pf_seqblocks[stream] <= -2) {
	  prefetch_down++;
	  cachesim_ref(&L2, a - 5 * L2.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}
527
528/* simple model with hardware prefetch */
529
530static
531CacheModelResult prefetch_I1_ref(Addr a, UChar size)
532{
533 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000534 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000535 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
536 return MemAccess;
537}
538
539static
540CacheModelResult prefetch_D1_ref(Addr a, UChar size)
541{
542 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000543 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000544 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
545 return MemAccess;
546}
547
548
549/* complex model with hardware prefetch */
550
551static
552CacheModelResult prefetch_I1_Read(Addr a, UChar size)
553{
554 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000555 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000556 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
557 case Hit: return L2_Hit;
558 case Miss: return MemAccess;
559 default: break;
560 }
561 return WriteBackMemAccess;
562}
563
564static
565CacheModelResult prefetch_D1_Read(Addr a, UChar size)
566{
567 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
weidendo09ee78e2009-02-24 12:26:53 +0000568 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000569 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
570 case Hit: return L2_Hit;
571 case Miss: return MemAccess;
572 default: break;
573 }
574 return WriteBackMemAccess;
575}
576
577static
578CacheModelResult prefetch_D1_Write(Addr a, UChar size)
579{
weidendo09ee78e2009-02-24 12:26:53 +0000580 prefetch_L2_doref(a);
weidendoa17f2a32006-03-20 10:27:30 +0000581 if ( cachesim_ref( &D1, a, size) == Hit ) {
582 /* Even for a L1 hit, the write-trough L1 passes
583 * the write to the L2 to make the L2 line dirty.
584 * But this causes no latency, so return the hit.
585 */
586 cachesim_ref_wb( &L2, Write, a, size);
587 return L1_Hit;
588 }
589 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
590 case Hit: return L2_Hit;
591 case Miss: return MemAccess;
592 default: break;
593 }
594 return WriteBackMemAccess;
595}
596
597
598/*------------------------------------------------------------*/
599/*--- Cache Simulation with use metric collection ---*/
600/*------------------------------------------------------------*/
601
602/* can not be combined with write-back or prefetch */
603
/* Allocate and initialize the cache-use tracking structures of a
 * cache: per-line use/loaded arrays, plus the byte-offset ->
 * touched-bits mask tables described below.  Panics on cache
 * configurations whose low tag bits cannot store a way index. */
static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use = CLG_MALLOC("cl.sim.cu_ic.1",
                        sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask
     * Example: for a given cache line, you get an access starting at
     * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
     * line size of 32, you have 1 bit per byte in the mask:
     *
     *   bit31   bit8 bit5  bit 0
     *       |      |  |    |
     *       11..111111100000   line_start_mask[5]
     *       00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] && line_end_mask[8]
     *
     */
    start_val = end_val = ~0;
    if (c->line_size < 32) {
        /* more than one mask bit per byte */
        int bits_per_byte = 32/c->line_size;
        start_mask = (1<<bits_per_byte)-1;
        end_mask   = start_mask << (32-bits_per_byte);
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            start_val  = start_val & ~start_mask;
            start_mask = start_mask << bits_per_byte;

            c->line_end_mask[c->line_size-i-1] = end_val;
            end_val  = end_val & ~end_mask;
            end_mask = end_mask >> bits_per_byte;
        }
    }
    else {
        /* each mask bit covers several bytes */
        int bytes_per_bit = c->line_size/32;
        start_mask = 1;
        end_mask   = 1 << 31;
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            c->line_end_mask[c->line_size-i-1] = end_val;
            if ( ((i+1)%bytes_per_bit) == 0) {
                start_val   &= ~start_mask;
                end_val     &= ~end_mask;
                start_mask <<= 1;
                end_mask   >>= 1;
            }
        }
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
        CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

    /* We use lower tag bits as offset pointers to cache use info.
     * I.e. some cache parameters don't work.
     */
    if ( (1<<c->tag_shift) < c->assoc) {
        VG_(message)(Vg_DebugMsg,
                     "error: Use associativity < %d for cache use statistics!",
                     (1<<c->tag_shift) );
        VG_(tool_panic)("Unsupported cache configuration");
    }
}
682
weidendoa17f2a32006-03-20 10:27:30 +0000683
/* for I1/D1 caches.
 * This macro expands to the full cacheuse_<L>_doRead() access function
 * for an L1 cache: an MRU-ordered set lookup like cachesim_setref(),
 * but with cache-use tracking.  The low tag bits of every entry store
 * the way index used to locate the line's use/loaded slot.  On a hit
 * the line's access count and touched-bits mask are updated; on a miss
 * the eviction is accounted via update_<L>_use().  Accesses straddling
 * two lines handle each line in turn (first-line miss result kept in
 * miss1, second in miss2). */
#define CACHEUSE(L)                                              \
                                                                 \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
{                                                                \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1); \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
   UWord tag  = a & L.tag_mask;                                  \
   UWord tag2;                                                   \
   int i, j, idx;                                                \
   UWord *set, tmp_tag;                                          \
   UInt use_mask;                                                \
                                                                 \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",     \
            L.name, a, size, set1, set2);                        \
                                                                 \
   /* First case: word entirely within line. */                  \
   if (set1 == set2) {                                           \
                                                                 \
      set = &(L.tags[set1 * L.assoc]);                           \
      use_mask = L.line_start_mask[a & L.line_size_mask] &       \
                 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
                                                                 \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up */\
      /* if we have a direct-mapped (1-way) cache. */            \
      if (tag == (set[0] & L.tag_mask)) {                        \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);        \
         L.use[idx].count ++;                                    \
         L.use[idx].mask |= use_mask;                            \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                 use_mask, L.use[idx].mask, L.use[idx].count);   \
         return L1_Hit;                                          \
      }                                                          \
      /* If the tag is one other than the MRU, move it into the MRU spot */\
      /* and shuffle the rest down. */                           \
      for (i = 1; i < L.assoc; i++) {                            \
         if (tag == (set[i] & L.tag_mask)) {                     \
            tmp_tag = set[i];                                    \
            for (j = i; j > 0; j--) {                            \
               set[j] = set[j - 1];                              \
            }                                                    \
            set[0] = tmp_tag;                                    \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);    \
            L.use[idx].count ++;                                 \
            L.use[idx].mask |= use_mask;                         \
         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                 use_mask, L.use[idx].mask, L.use[idx].count);   \
            return L1_Hit;                                       \
         }                                                       \
      }                                                          \
                                                                 \
      /* A miss; install this tag as MRU, shuffle rest down. */  \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                  \
      for (j = L.assoc - 1; j > 0; j--) {                        \
         set[j] = set[j - 1];                                    \
      }                                                          \
      set[0] = tag | tmp_tag;                                    \
      idx = (set1 * L.assoc) + tmp_tag;                          \
      return update_##L##_use(&L, idx,                           \
                       use_mask, a &~ L.line_size_mask);         \
                                                                 \
   /* Second case: word straddles two lines. */                  \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */     \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {               \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
      set = &(L.tags[set1 * L.assoc]);                           \
      use_mask = L.line_start_mask[a & L.line_size_mask];        \
      if (tag == (set[0] & L.tag_mask)) {                        \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);        \
         L.use[idx].count ++;                                    \
         L.use[idx].mask |= use_mask;                            \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                 use_mask, L.use[idx].mask, L.use[idx].count);   \
         goto block2;                                            \
      }                                                          \
      for (i = 1; i < L.assoc; i++) {                            \
         if (tag == (set[i] & L.tag_mask)) {                     \
            tmp_tag = set[i];                                    \
            for (j = i; j > 0; j--) {                            \
               set[j] = set[j - 1];                              \
            }                                                    \
            set[0] = tmp_tag;                                    \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);    \
            L.use[idx].count ++;                                 \
            L.use[idx].mask |= use_mask;                         \
         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                 use_mask, L.use[idx].mask, L.use[idx].count);   \
            goto block2;                                         \
         }                                                       \
      }                                                          \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                  \
      for (j = L.assoc - 1; j > 0; j--) {                        \
         set[j] = set[j - 1];                                    \
      }                                                          \
      set[0] = tag | tmp_tag;                                    \
      idx = (set1 * L.assoc) + tmp_tag;                          \
      miss1 = update_##L##_use(&L, idx,                          \
                       use_mask, a &~ L.line_size_mask);         \
block2:                                                          \
      set = &(L.tags[set2 * L.assoc]);                           \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      tag2 = (a+size-1) & L.tag_mask;                            \
      if (tag2 == (set[0] & L.tag_mask)) {                       \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);        \
         L.use[idx].count ++;                                    \
         L.use[idx].mask |= use_mask;                            \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                 use_mask, L.use[idx].mask, L.use[idx].count);   \
         return miss1;                                           \
      }                                                          \
      for (i = 1; i < L.assoc; i++) {                            \
         if (tag2 == (set[i] & L.tag_mask)) {                    \
            tmp_tag = set[i];                                    \
            for (j = i; j > 0; j--) {                            \
               set[j] = set[j - 1];                              \
            }                                                    \
            set[0] = tmp_tag;                                    \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);    \
            L.use[idx].count ++;                                 \
            L.use[idx].mask |= use_mask;                         \
         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                 use_mask, L.use[idx].mask, L.use[idx].count);   \
            return miss1;                                        \
         }                                                       \
      }                                                          \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                  \
      for (j = L.assoc - 1; j > 0; j--) {                        \
         set[j] = set[j - 1];                                    \
      }                                                          \
      set[0] = tag2 | tmp_tag;                                   \
      idx = (set2 * L.assoc) + tmp_tag;                          \
      miss2 = update_##L##_use(&L, idx,                          \
                       use_mask, (a+size-1) &~ L.line_size_mask); \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
                                                                 \
   } else {                                                      \
      VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
      VG_(tool_panic)("item straddles more than two cache sets"); \
   }                                                             \
   return 0;                                                     \
}
832
833
834/* logarithmic bitcounting algorithm, see
835 * http://graphics.stanford.edu/~seander/bithacks.html
836 */
/* Population count: number of set bits in a 32-bit word.
 * Classic parallel reduction — fold pairs, then nibbles, bytes and
 * halfwords (see the Stanford "Bit Twiddling Hacks" page). */
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c = bits;   /* running per-field sums */

  c = ((c >> 1)  & 0x55555555u) + (c & 0x55555555u);
  c = ((c >> 2)  & 0x33333333u) + (c & 0x33333333u);
  c = ((c >> 4)  & 0x0F0F0F0Fu) + (c & 0x0F0F0F0Fu);
  c = ((c >> 8)  & 0x00FF00FFu) + (c & 0x00FF00FFu);
  c = ((c >> 16) & 0x0000FFFFu) + (c & 0x0000FFFFu);
  return c;
}
851
/* Account an L2 eviction at slot 'idx' and re-initialize the slot for
 * the newly loaded 'memline'.  For the evicted line, the access count
 * and the spatial loss (bytes of the line never touched, derived from
 * the use mask) are added to the loading instruction's cost center. */
static void update_L2_use(int idx, Addr memline)
{
  line_loaded* loaded = &(L2.loaded[idx]);
  line_use* use = &(L2.use[idx]);
  /* untouched bytes of the old line: (unset mask bits / 32) * line_size */
  int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;

  CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
           idx, bb_base + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
             use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
             CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      /* AcCost: average cost per access (1000/count); SpLoss: unused bytes */
      (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_L2_SpLoss] += i;
    }
  }

  /* reset use info for the incoming line */
  use->count = 0;
  use->mask = 0;

  loaded->memline = memline;
  loaded->iaddr = bb_base + current_ii->instr_offset;
  /* attribute costs to the skipped-function container if inside one */
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    cost_base + current_ii->cost_offset;
}
881
/* L2 lookup for the cache-use model: MRU-ordered set search using the
 * low tag bits as way index (as in the CACHEUSE macro).  Links the L1
 * line's bookkeeping (l1_loaded->dep_use) to the L2 line's use slot,
 * and on a miss accounts the eviction via update_L2_use(). */
static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UWord* set = &(L2.tags[setNo * L2.assoc]);
   UWord tag  = memline & L2.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);

   /* MRU way checked first (most common case) */
   if (tag == (set[0] & L2.tag_mask)) {
      idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                L2.use[idx].mask, L2.use[idx].count);
      return L2_Hit;
   }
   /* search remaining ways; on hit, rotate entry into the MRU spot */
   for (i = 1; i < L2.assoc; i++) {
      if (tag == (set[i] & L2.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
         l1_loaded->dep_use = &(L2.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                   i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                   L2.use[idx].mask, L2.use[idx].count);
         return L2_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;  /* way index of evicted LRU */
   for (j = L2.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * L2.assoc) + tmp_tag;
   l1_loaded->dep_use = &(L2.use[idx]);

   update_L2_use(idx, memline);

   return MemAccess;
}
933
934
935
936
/* Generator for the per-L1-cache usage updaters (update_I1_use,
 * update_D1_use).  The generated function is called when a line of
 * cache L is replaced, or on flush (memline == 0; used by
 * cacheuse_finish to write out pending counters).
 *
 * It accounts the access cost (1000/use count) and the spatial loss
 * (fraction of line bytes never touched, from the use mask) to the
 * instruction which originally loaded the line, merges the line's use
 * bits into the depending L2 line, then re-initializes the usage info
 * for the line now being loaded.  For a real replacement it continues
 * with the L2 lookup and returns its result; for a flush it returns
 * L2_Hit. */
#define UPDATE_USE(L)						     \
								     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
			       UInt mask, Addr memline)		     \
{								     \
   line_loaded* loaded = &(cache->loaded[idx]);			     \
   line_use* use = &(cache->use[idx]);				     \
   int c  = ((32 - countBits(use->mask)) * cache->line_size)>>5;     \
								     \
   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
	    cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
   if (use->count>0) {						     \
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
	       use->count, c, use->mask, loaded->memline, loaded->iaddr);	\
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",		     \
	       CLG_(current_state).collect, loaded->use_base);	     \
								     \
      if (CLG_(current_state).collect && loaded->use_base) {	     \
	 (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;  \
	 (loaded->use_base)[off_##L##_SpLoss] += c;		     \
								     \
	 /* FIXME (?): L1/L2 line sizes must be equal ! */	     \
	 loaded->dep_use->mask |= use->mask;			     \
	 loaded->dep_use->count += use->count;			     \
      }								     \
   }								     \
								     \
   use->count = 1;						     \
   use->mask  = mask;						     \
   loaded->memline = memline;					     \
   loaded->iaddr   = bb_base + current_ii->instr_offset;	     \
   loaded->use_base = (CLG_(current_state).nonskipped) ?	     \
		      CLG_(current_state).nonskipped->skipped :	     \
		      cost_base + current_ii->cost_offset;	     \
								     \
   if (memline == 0) return L2_Hit;				     \
   return cacheuse_L2_access(memline, loaded);			     \
}
975
/* Instantiate the usage updaters and the cache-usage access handlers
 * (cacheuse_<L>_doRead, via the CACHEUSE macro defined earlier in this
 * file) for both L1 caches. */
UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);
981
982
983static
984void cacheuse_finish(void)
985{
986 int i;
987 InstrInfo ii = { 0,0,0,0,0 };
988
989 if (!CLG_(current_state).collect) return;
990
991 bb_base = 0;
992 current_ii = &ii;
993 cost_base = 0;
994
995 /* update usage counters */
996 if (I1.use)
997 for (i = 0; i < I1.sets * I1.assoc; i++)
998 if (I1.loaded[i].use_base)
999 update_I1_use( &I1, i, 0,0);
1000
1001 if (D1.use)
1002 for (i = 0; i < D1.sets * D1.assoc; i++)
1003 if (D1.loaded[i].use_base)
1004 update_D1_use( &D1, i, 0,0);
1005
1006 if (L2.use)
1007 for (i = 0; i < L2.sets * L2.assoc; i++)
1008 if (L2.loaded[i].use_base)
1009 update_L2_use(i, 0);
1010}
1011
1012
1013
1014/*------------------------------------------------------------*/
1015/*--- Helper functions called by instrumented code ---*/
1016/*------------------------------------------------------------*/
1017
1018
1019static __inline__
1020void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1021{
1022 switch(r) {
1023 case WriteBackMemAccess:
1024 if (clo_simulate_writeback) {
1025 c1[3]++;
1026 c2[3]++;
1027 }
1028 // fall through
1029
1030 case MemAccess:
1031 c1[2]++;
1032 c2[2]++;
1033 // fall through
1034
1035 case L2_Hit:
1036 c1[1]++;
1037 c2[1]++;
1038 // fall through
1039
1040 default:
1041 c1[0]++;
1042 c2[0]++;
1043 }
1044}
1045
1046
1047VG_REGPARM(1)
1048static void log_1I0D(InstrInfo* ii)
1049{
1050 CacheModelResult IrRes;
1051
1052 current_ii = ii;
1053 IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1054
barta0b6b2c2008-07-07 06:49:24 +00001055 CLG_DEBUG(6, "log_1I0D: Ir=%#lx/%u => Ir %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001056 bb_base + ii->instr_offset, ii->instr_size, IrRes);
1057
1058 if (CLG_(current_state).collect) {
1059 ULong* cost_Ir;
1060
1061 if (CLG_(current_state).nonskipped)
1062 cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1063 else
1064 cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
1065
1066 inc_costs(IrRes, cost_Ir,
1067 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1068 }
1069}
1070
1071
/* Instruction doing a read access */

/* Helper called from instrumented code: one instruction fetch plus one
 * data read.  Simulates the I1 fetch and the D1 read, then attributes
 * both results to the current cost center (or, inside skipped
 * functions, to the nonskipped caller's cost). */
VG_REGPARM(2)
static void log_1I1Dr(InstrInfo* ii, Addr data)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data, ii->data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
	      bb_base + ii->instr_offset, ii->instr_size,
	      data, ii->data_size, IrRes, DrRes);

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dr;

	if (CLG_(current_state).nonskipped) {
	    /* attribute events of skipped functions to the caller */
	    cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
	    cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
	}
	else {
	    cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
	    cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
    }
}
1105
1106
1107VG_REGPARM(2)
1108static void log_0I1Dr(InstrInfo* ii, Addr data)
1109{
1110 CacheModelResult DrRes;
1111
1112 current_ii = ii;
1113 DrRes = (*simulator.D1_Read)(data, ii->data_size);
1114
barta0b6b2c2008-07-07 06:49:24 +00001115 CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001116 data, ii->data_size, DrRes);
1117
1118 if (CLG_(current_state).collect) {
1119 ULong *cost_Dr;
1120
1121 if (CLG_(current_state).nonskipped) {
1122 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1123 }
1124 else {
1125 cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
1126 }
1127
1128 inc_costs(DrRes, cost_Dr,
1129 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1130 }
1131}
1132
1133
/* Instruction doing a write access */

/* Helper called from instrumented code: one instruction fetch plus one
 * data write.
 * NOTE(review): the nonskipped branch uses off_sim_Ir/off_sim_Dw while
 * the read variant (log_1I1Dr) uses off_full_*.  With the current event
 * set layout these are equal (off_full_* is copied from off_sim_* in
 * CLG_(init_eventsets)), so behaviour is identical -- confirm before
 * changing the event set layout. */
VG_REGPARM(2)
static void log_1I1Dw(InstrInfo* ii, Addr data)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data, ii->data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
	      bb_base + ii->instr_offset, ii->instr_size,
	      data, ii->data_size, IrRes, DwRes);

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dw;

	if (CLG_(current_state).nonskipped) {
	    cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
	    cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
	}
	else {
	    cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
	    cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}
1167
1168VG_REGPARM(2)
1169static void log_0I1Dw(InstrInfo* ii, Addr data)
1170{
1171 CacheModelResult DwRes;
1172
1173 current_ii = ii;
1174 DwRes = (*simulator.D1_Write)(data, ii->data_size);
1175
barta0b6b2c2008-07-07 06:49:24 +00001176 CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001177 data, ii->data_size, DwRes);
1178
1179 if (CLG_(current_state).collect) {
1180 ULong *cost_Dw;
1181
1182 if (CLG_(current_state).nonskipped) {
1183 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
1184 }
1185 else {
1186 cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
1187 }
1188
1189 inc_costs(DwRes, cost_Dw,
1190 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1191 }
1192}
1193
/* Instruction doing a read and a write access */

/* Helper called from instrumented code: one instruction fetch plus a
 * data read and a data write (possibly different addresses). */
VG_REGPARM(3)
static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
{
    CacheModelResult IrRes, DrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
    DwRes = (*simulator.D1_Write)(data2, ii->data_size);

    CLG_DEBUG(6,
	      "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
	      bb_base + ii->instr_offset, ii->instr_size,
	      data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dr, *cost_Dw;

	if (CLG_(current_state).nonskipped) {
	    /* NOTE(review): uses off_sim_* here vs off_full_* in the
	     * 1I1Dr variant; with the current event set layout these
	     * offsets are equal (see CLG_(init_eventsets)). */
	    cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
	    cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
	    cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
	}
	else {
	    cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
	    cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
	    cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}
1233
1234VG_REGPARM(3)
1235static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
1236{
1237 CacheModelResult DrRes, DwRes;
1238
1239 current_ii = ii;
1240 DrRes = (*simulator.D1_Read)(data1, ii->data_size);
1241 DwRes = (*simulator.D1_Write)(data2, ii->data_size);
1242
1243 CLG_DEBUG(6,
barta0b6b2c2008-07-07 06:49:24 +00001244 "log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
weidendoa17f2a32006-03-20 10:27:30 +00001245 data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);
1246
1247 if (CLG_(current_state).collect) {
1248 ULong *cost_Dr, *cost_Dw;
1249
1250 if (CLG_(current_state).nonskipped) {
1251 cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
1252 cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
1253 }
1254 else {
1255 cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
1256 cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
1257 }
1258
1259 inc_costs(DrRes, cost_Dr,
1260 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1261 inc_costs(DwRes, cost_Dw,
1262 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1263 }
1264}
1265
1266
1267/*------------------------------------------------------------*/
1268/*--- Cache configuration ---*/
1269/*------------------------------------------------------------*/
1270
/* A cache_t with all fields -1 marks "not given on the command line";
 * configure_caches() then falls back to auto-detected values. */
#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;
1276
1277
1278/* Checks cache config is ok; makes it so if not. */
1279static
1280void check_cache(cache_t* cache, Char *name)
1281{
weidendo144b76c2009-01-26 22:56:14 +00001282 /* Simulator requires line size and set count to be powers of two */
1283 if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
1284 (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
weidendoa17f2a32006-03-20 10:27:30 +00001285 VG_(message)(Vg_UserMsg,
weidendo144b76c2009-01-26 22:56:14 +00001286 "error: %s set count not a power of two; aborting.",
1287 name);
weidendoa17f2a32006-03-20 10:27:30 +00001288 }
1289
weidendo144b76c2009-01-26 22:56:14 +00001290 if (-1 == VG_(log2)(cache->line_size)) {
weidendoa17f2a32006-03-20 10:27:30 +00001291 VG_(message)(Vg_UserMsg,
1292 "error: %s line size of %dB not a power of two; aborting.",
1293 name, cache->line_size);
1294 VG_(exit)(1);
1295 }
1296
1297 // Then check line size >= 16 -- any smaller and a single instruction could
1298 // straddle three cache lines, which breaks a simulation assertion and is
1299 // stupid anyway.
1300 if (cache->line_size < MIN_LINE_SIZE) {
1301 VG_(message)(Vg_UserMsg,
1302 "error: %s line size of %dB too small; aborting.",
1303 name, cache->line_size);
1304 VG_(exit)(1);
1305 }
1306
1307 /* Then check cache size > line size (causes seg faults if not). */
1308 if (cache->size <= cache->line_size) {
1309 VG_(message)(Vg_UserMsg,
1310 "error: %s cache size of %dB <= line size of %dB; aborting.",
1311 name, cache->size, cache->line_size);
1312 VG_(exit)(1);
1313 }
1314
1315 /* Then check assoc <= (size / line size) (seg faults otherwise). */
1316 if (cache->assoc > (cache->size / cache->line_size)) {
1317 VG_(message)(Vg_UserMsg,
1318 "warning: %s associativity > (size / line size); aborting.", name);
1319 VG_(exit)(1);
1320 }
1321}
1322
1323static
1324void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
1325{
1326#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
1327
1328 Int n_clos = 0;
1329
1330 // Count how many were defined on the command line.
1331 if (DEFINED(clo_I1_cache)) { n_clos++; }
1332 if (DEFINED(clo_D1_cache)) { n_clos++; }
1333 if (DEFINED(clo_L2_cache)) { n_clos++; }
1334
1335 // Set the cache config (using auto-detection, if supported by the
1336 // architecture)
1337 VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );
1338
1339 // Then replace with any defined on the command line.
1340 if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
1341 if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
1342 if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
1343
1344 // Then check values and fix if not acceptable.
1345 check_cache(I1c, "I1");
1346 check_cache(D1c, "D1");
1347 check_cache(L2c, "L2");
1348
1349 if (VG_(clo_verbosity) > 1) {
1350 VG_(message)(Vg_UserMsg, "Cache configuration used:");
1351 VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines",
1352 I1c->size, I1c->assoc, I1c->line_size);
1353 VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines",
1354 D1c->size, D1c->assoc, D1c->line_size);
1355 VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines",
1356 L2c->size, L2c->assoc, L2c->line_size);
1357 }
1358#undef CMD_LINE_DEFINED
1359}
1360
1361
/* Initialize and clear simulator state */

/* Select and wire up the access handlers according to the command line
 * options: no simulation at all, cache-usage simulation, prefetch
 * simulation (with or without write-back), or plain cache simulation. */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t I1c, D1c, L2c;

  /* Without cache simulation, stub out all logging handlers. */
  if (!CLG_(clo).simulate_cache) {
    CLG_(cachesim).log_1I0D = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I2D = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw_name = "(no function)";
    CLG_(cachesim).log_1I2D_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I2D = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    CLG_(cachesim).log_0I2D_name = "(no function)";
    return;
  }

  /* Configuration of caches only needed with real cache simulation */
  configure_caches(&I1c, &D1c, &L2c);

  I1.name = "I1";
  D1.name = "D1";
  L2.name = "L2";

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(L2c, &L2);

  /* the other cache simulators use the standard helpers
   * with dispatching via simulator struct */

  CLG_(cachesim).log_1I0D = log_1I0D;
  CLG_(cachesim).log_1I0D_name = "log_1I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I2D = log_1I2D;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
  CLG_(cachesim).log_1I2D_name = "log_1I2D";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I2D = log_0I2D;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
  CLG_(cachesim).log_0I2D_name = "log_0I2D";

  if (clo_collect_cacheuse) {

    /* Output warning for not supported option combinations */
    if (clo_simulate_hwpref) {
      VG_(message)(Vg_DebugMsg,
		   "warning: prefetch simulation can not be used with cache usage");
      clo_simulate_hwpref = False;
    }

    if (clo_simulate_writeback) {
      VG_(message)(Vg_DebugMsg,
		   "warning: write-back simulation can not be used with cache usage");
      clo_simulate_writeback = False;
    }

    /* NOTE(review): D1_Write deliberately shares cacheuse_D1_doRead --
     * usage tracking treats reads and writes identically; confirm if
     * write-specific usage tracking is ever added. */
    simulator.I1_Read = cacheuse_I1_doRead;
    simulator.D1_Read = cacheuse_D1_doRead;
    simulator.D1_Write = cacheuse_D1_doRead;
    return;
  }

  if (clo_simulate_hwpref) {
    prefetch_clear();

    if (clo_simulate_writeback) {
      simulator.I1_Read = prefetch_I1_Read;
      simulator.D1_Read = prefetch_D1_Read;
      simulator.D1_Write = prefetch_D1_Write;
    }
    else {
      simulator.I1_Read = prefetch_I1_ref;
      simulator.D1_Read = prefetch_D1_ref;
      simulator.D1_Write = prefetch_D1_ref;
    }

    return;
  }

  if (clo_simulate_writeback) {
      simulator.I1_Read = cachesim_I1_Read;
      simulator.D1_Read = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
  }
  else {
      simulator.I1_Read = cachesim_I1_ref;
      simulator.D1_Read = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
  }
}
1469
1470
1471/* Clear simulator state. Has to be initialized before */
1472static
1473void cachesim_clear(void)
1474{
1475 cachesim_clearcache(&I1);
1476 cachesim_clearcache(&D1);
1477 cachesim_clearcache(&L2);
1478
1479 prefetch_clear();
1480}
1481
1482
1483static void cachesim_getdesc(Char* buf)
1484{
1485 Int p;
1486 p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1487 p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1488 VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
1489}
1490
/* Print the command line options understood by the cache simulator
 * (part of Callgrind's --help output). */
static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n cache simulator options:\n"
" --simulate-cache=no|yes Do cache simulation [no]\n"
" --simulate-wb=no|yes Count write-back events [no]\n"
" --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
" --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
" --cacheuse=no|yes Collect cache block use [no]\n"
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
	      );
}
1508
/* Parse the "<size>,<assoc>,<line_size>" value of a --I1/--D1/--L2
 * option into *cache.  opt_len is the length of the "--XX=" prefix.
 * On malformed input VG_(err_bad_option) terminates the process; the
 * strdup'ed buffer is not freed on that path, which is harmless since
 * that call does not return. */
static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
{
   int i1, i2, i3;
   int i;
   /* work on a copy: commas are overwritten with NULs below */
   char *opt = VG_(strdup)("cl.sim.po.1", orig_opt);

   i = i1 = opt_len;

   /* Option looks like "--I1=65536,2,64".
    * Find commas, replace with NULs to make three independent
    * strings, then extract numbers.  Yuck. */
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i2 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i3 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if ('\0' != opt[i]) goto bad;

   /* i1/i2/i3 now index the three NUL-terminated number strings */
   cache->size = (Int)VG_(atoll)(opt + i1);
   cache->assoc = (Int)VG_(atoll)(opt + i2);
   cache->line_size = (Int)VG_(atoll)(opt + i3);

   VG_(free)(opt);

   return;

  bad:
   VG_(err_bad_option)(orig_opt);
}
1544
1545/* Check for command line option for cache configuration.
1546 * Return False if unknown and not handled.
1547 *
1548 * Called from CLG_(process_cmd_line_option)() in clo.c
1549 */
1550static Bool cachesim_parse_opt(Char* arg)
1551{
1552 if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
1553 clo_simulate_writeback = True;
1554 else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
1555 clo_simulate_writeback = False;
1556
1557 else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
1558 clo_simulate_hwpref = True;
1559 else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
1560 clo_simulate_hwpref = False;
1561
1562 else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
1563 clo_simulate_sectors = True;
1564 else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
1565 clo_simulate_sectors = False;
1566
1567 else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
1568 clo_collect_cacheuse = True;
1569 /* Use counters only make sense with fine dumping */
1570 CLG_(clo).dump_instr = True;
1571 }
1572 else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
1573 clo_collect_cacheuse = False;
1574
1575 /* 5 is length of "--I1=" */
1576 else if (0 == VG_(strncmp)(arg, "--I1=", 5))
1577 parse_opt(&clo_I1_cache, arg, 5);
1578 else if (0 == VG_(strncmp)(arg, "--D1=", 5))
1579 parse_opt(&clo_D1_cache, arg, 5);
1580 else if (0 == VG_(strncmp)(arg, "--L2=", 5))
1581 parse_opt(&clo_L2_cache, arg, 5);
1582 else
1583 return False;
1584
1585 return True;
1586}
1587
/* Adds commas to ULong, right justifying in a field field_width wide, returns
 * the string in buf. */

/* Returns the length of the comma-fied number (excluding any padding
 * spaces); callers use this to size subsequent columns.  The shifting
 * is done in place, back to front, so buf must be large enough for
 * max(field_width, digits + commas) + 1. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than it's size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
	 j = 0;
	 n_commas--;
	 buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++) buf[i] = ' ';
   return new_len;
}
1619
/* Format n as a fixed-point percentage (scale ex, e.g. ex==10 gives
 * one fractional digit) right justified in field_width, into buf.
 * NOTE(review): the fraction is printed with plain %d, so e.g.
 * n%ex == 5 with ex == 100 renders ".5" rather than ".05" -- confirm
 * whether this display quirk is intended before changing it. */
static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0; /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for ( ; i >= 0; i--) buf[i + space] = buf[i];
   for (i = 0; i < space; i++) buf[i] = ' ';
}
1635
/* Print the overall simulation summary (I/D/L2 refs, misses and miss
 * rates) at program end, cachegrind-style.  Column widths are derived
 * from the widest value in each column via commify().
 * Note: zero totals are overwritten with 1 below purely to avoid
 * division by zero in the rate computations. */
static
void cachesim_printstat(void)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong L2_total_m, L2_total_mr, L2_total_mw,
    L2_total, L2_total_r, L2_total_w;
  char buf1[RESULTS_BUF_LEN],
    buf2[RESULTS_BUF_LEN],
    buf3[RESULTS_BUF_LEN];
  Int l1, l2, l3;
  Int p;

  if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
		 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
		 prefetch_down);
    VG_(message)(Vg_DebugMsg, "");
  }

  /* I cache results. Use the I_refs value to determine the first column
   * width. */
  l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
  VG_(message)(Vg_UserMsg, "I refs: %s", buf1);

  if (!CLG_(clo).simulate_cache) return;

  commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);

  commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);

  p = 100;    /* fixed-point scale: two fractional digits */

  /* avoid division by zero below */
  if (0 == total[CLG_(sets).off_full_Ir])
    total[CLG_(sets).off_full_Ir] = 1;

  percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
	     total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);

  percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
	     total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
  VG_(message)(Vg_UserMsg, "");

  /* D cache results.
     Use the D_refs.rd and D_refs.wr values to determine the
   * width of columns 2 & 3. */

  /* D_total = Dr + Dw event vectors, summed element-wise */
  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
  CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );

  commify( D_total[0], l1, buf1);
  l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
  l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
  VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  commify( D_total[1], l1, buf1);
  commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
  commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
  commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  p = 10;    /* fixed-point scale: one fractional digit */

  /* avoid division by zero below */
  if (0 == D_total[0]) D_total[0] = 1;
  if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
  if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;

  percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
  percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
	     total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);

  percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
  percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
	     total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
  VG_(message)(Vg_UserMsg, "");



  /* L2 overall results */

  /* L2 refs = all L1 misses (reads: Dr + Ir; writes: Dw) */
  L2_total =
    total[CLG_(sets).off_full_Dr +1] +
    total[CLG_(sets).off_full_Dw +1] +
    total[CLG_(sets).off_full_Ir +1];
  L2_total_r =
    total[CLG_(sets).off_full_Dr +1] +
    total[CLG_(sets).off_full_Ir +1];
  L2_total_w = total[CLG_(sets).off_full_Dw +1];
  commify(L2_total, l1, buf1);
  commify(L2_total_r, l2, buf2);
  commify(L2_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  /* L2 misses = all L2-level misses */
  L2_total_m =
    total[CLG_(sets).off_full_Dr +2] +
    total[CLG_(sets).off_full_Dw +2] +
    total[CLG_(sets).off_full_Ir +2];
  L2_total_mr =
    total[CLG_(sets).off_full_Dr +2] +
    total[CLG_(sets).off_full_Ir +2];
  L2_total_mw = total[CLG_(sets).off_full_Dw +2];
  commify(L2_total_m, l1, buf1);
  commify(L2_total_mr, l2, buf2);
  commify(L2_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
	       buf1, buf2, buf3);

  /* L2 miss rates are relative to the total number of accesses */
  percentify(L2_total_m * 100 * p /
	     (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
  percentify(L2_total_mr * 100 * p /
	     (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
	     p, l2+1, buf2);
  percentify(L2_total_mw * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
	       buf1, buf2,buf3);
}
1773
1774
1775/*------------------------------------------------------------*/
1776/*--- Setup for Event set. ---*/
1777/*------------------------------------------------------------*/
1778
1779struct event_sets CLG_(sets);
1780
1781void CLG_(init_eventsets)(Int max_user)
1782{
1783 EventType * e1, *e2, *e3, *e4;
1784 EventSet *Ir, *Dr, *Dw;
1785 EventSet *D0, *D1r, *D1w, *D2;
1786 EventSet *sim, *full;
1787 EventSet *use;
1788 int sizeOfUseIr;
1789
1790 use = CLG_(get_eventset)("Use", 4);
1791 if (clo_collect_cacheuse) {
1792 /* if TUse is 0, there was never a load, and no loss, too */
1793 e1 = CLG_(register_eventtype)("AcCost1");
1794 CLG_(add_eventtype)(use, e1);
1795 e1 = CLG_(register_eventtype)("SpLoss1");
1796 CLG_(add_eventtype)(use, e1);
1797 e1 = CLG_(register_eventtype)("AcCost2");
1798 CLG_(add_eventtype)(use, e1);
1799 e1 = CLG_(register_eventtype)("SpLoss2");
1800 CLG_(add_eventtype)(use, e1);
1801 }
1802
1803 Ir = CLG_(get_eventset)("Ir", 4);
1804 Dr = CLG_(get_eventset)("Dr", 4);
1805 Dw = CLG_(get_eventset)("Dw", 4);
1806 if (CLG_(clo).simulate_cache) {
1807 e1 = CLG_(register_eventtype)("Ir");
1808 e2 = CLG_(register_eventtype)("I1mr");
1809 e3 = CLG_(register_eventtype)("I2mr");
1810 if (clo_simulate_writeback) {
1811 e4 = CLG_(register_eventtype)("I2dmr");
1812 CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
1813 }
1814 else
1815 CLG_(add_dep_event3)(Ir, e1,e2,e3);
1816
1817 e1 = CLG_(register_eventtype)("Dr");
1818 e2 = CLG_(register_eventtype)("D1mr");
1819 e3 = CLG_(register_eventtype)("D2mr");
1820 if (clo_simulate_writeback) {
1821 e4 = CLG_(register_eventtype)("D2dmr");
1822 CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
1823 }
1824 else
1825 CLG_(add_dep_event3)(Dr, e1,e2,e3);
1826
1827 e1 = CLG_(register_eventtype)("Dw");
1828 e2 = CLG_(register_eventtype)("D1mw");
1829 e3 = CLG_(register_eventtype)("D2mw");
1830 if (clo_simulate_writeback) {
1831 e4 = CLG_(register_eventtype)("D2dmw");
1832 CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
1833 }
1834 else
1835 CLG_(add_dep_event3)(Dw, e1,e2,e3);
1836
1837 }
1838 else {
1839 e1 = CLG_(register_eventtype)("Ir");
1840 CLG_(add_eventtype)(Ir, e1);
1841 }
1842
1843 sizeOfUseIr = use->size + Ir->size;
1844 D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
1845 CLG_(add_eventset)(D0, use);
1846 off_D0_Ir = CLG_(add_eventset)(D0, Ir);
1847
1848 D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
1849 CLG_(add_eventset)(D1r, use);
1850 off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
1851 off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
1852
1853 D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
1854 CLG_(add_eventset)(D1w, use);
1855 off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
1856 off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);
1857
1858 D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
1859 CLG_(add_eventset)(D2, use);
1860 off_D2_Ir = CLG_(add_eventset)(D2, Ir);
1861 off_D2_Dr = CLG_(add_eventset)(D2, Dr);
1862 off_D2_Dw = CLG_(add_eventset)(D2, Dw);
1863
1864 sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
1865 CLG_(add_eventset)(sim, use);
1866 CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
1867 CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
1868 CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);
1869
1870 if (CLG_(clo).collect_alloc) max_user += 2;
1871 if (CLG_(clo).collect_systime) max_user += 2;
1872
1873 full = CLG_(get_eventset)("full", sim->size + max_user);
1874 CLG_(add_eventset)(full, sim);
1875 CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
1876 CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
1877 CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;
1878
1879 CLG_(sets).use = use;
1880 CLG_(sets).Ir = Ir;
1881 CLG_(sets).Dr = Dr;
1882 CLG_(sets).Dw = Dw;
1883
1884 CLG_(sets).D0 = D0;
1885 CLG_(sets).D1r = D1r;
1886 CLG_(sets).D1w = D1w;
1887 CLG_(sets).D2 = D2;
1888
1889 CLG_(sets).sim = sim;
1890 CLG_(sets).full = full;
1891
1892 if (CLG_(clo).collect_alloc) {
1893 e1 = CLG_(register_eventtype)("allocCount");
1894 e2 = CLG_(register_eventtype)("allocSize");
1895 CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
1896 }
1897
1898 if (CLG_(clo).collect_systime) {
1899 e1 = CLG_(register_eventtype)("sysCount");
1900 e2 = CLG_(register_eventtype)("sysTime");
1901 CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
1902 }
1903
1904 CLG_DEBUGIF(1) {
1905 CLG_DEBUG(1, "EventSets:\n");
1906 CLG_(print_eventset)(-2, use);
1907 CLG_(print_eventset)(-2, Ir);
1908 CLG_(print_eventset)(-2, Dr);
1909 CLG_(print_eventset)(-2, Dw);
1910 CLG_(print_eventset)(-2, sim);
1911 CLG_(print_eventset)(-2, full);
1912 }
1913
1914 /* Not-existing events are silently ignored */
1915 CLG_(dumpmap) = CLG_(get_eventmapping)(full);
1916 CLG_(append_event)(CLG_(dumpmap), "Ir");
1917 CLG_(append_event)(CLG_(dumpmap), "Dr");
1918 CLG_(append_event)(CLG_(dumpmap), "Dw");
1919 CLG_(append_event)(CLG_(dumpmap), "I1mr");
1920 CLG_(append_event)(CLG_(dumpmap), "D1mr");
1921 CLG_(append_event)(CLG_(dumpmap), "D1mw");
1922 CLG_(append_event)(CLG_(dumpmap), "I2mr");
1923 CLG_(append_event)(CLG_(dumpmap), "D2mr");
1924 CLG_(append_event)(CLG_(dumpmap), "D2mw");
1925 CLG_(append_event)(CLG_(dumpmap), "I2dmr");
1926 CLG_(append_event)(CLG_(dumpmap), "D2dmr");
1927 CLG_(append_event)(CLG_(dumpmap), "D2dmw");
1928 CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1929 CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1930 CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1931 CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1932 CLG_(append_event)(CLG_(dumpmap), "allocCount");
1933 CLG_(append_event)(CLG_(dumpmap), "allocSize");
1934 CLG_(append_event)(CLG_(dumpmap), "sysCount");
1935 CLG_(append_event)(CLG_(dumpmap), "sysTime");
1936
1937}
1938
1939
1940
1941static
1942void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
1943{
1944 /* if eventset use is defined, it is always first (hardcoded!) */
1945 CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);
1946
1947 /* FIXME: This is hardcoded... */
1948 if (es == CLG_(sets).D0) {
1949 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1950 cost + off_D0_Ir);
1951 }
1952 else if (es == CLG_(sets).D1r) {
1953 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1954 cost + off_D1r_Ir);
1955 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
1956 cost + off_D1r_Dr);
1957 }
1958 else if (es == CLG_(sets).D1w) {
1959 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1960 cost + off_D1w_Ir);
1961 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
1962 cost + off_D1w_Dw);
1963 }
1964 else {
1965 CLG_ASSERT(es == CLG_(sets).D2);
1966 CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
1967 cost + off_D2_Ir);
1968 CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
1969 cost + off_D2_Dr);
1970 CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
1971 cost + off_D2_Dw);
1972 }
1973}
1974
1975/* this is called at dump time for every instruction executed */
1976static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1977 InstrInfo* ii, ULong exe_count)
1978{
1979 if (!CLG_(clo).simulate_cache)
1980 cost[CLG_(sets).off_sim_Ir] += exe_count;
1981 else {
1982
1983#if 0
1984/* There is always a trivial case where exe_count and Ir can be
1985 * slightly different because ecounter is updated when executing
1986 * the next BB. E.g. for last BB executed, or when toggling collection
1987 */
1988 /* FIXME: Hardcoded that each eventset has Ir as first */
1989 if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
1990 VG_(printf)("==> Ir %llu, exe %llu\n",
1991 (bbcc->cost + ii->cost_offset)[0], exe_count);
1992 CLG_(print_bbcc_cost)(-2, bbcc);
1993 //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
1994 }
1995#endif
1996
1997 add_and_zero_Dx(ii->eventset, cost,
1998 bbcc->cost + ii->cost_offset);
1999 }
2000}
2001
2002static
2003void cachesim_after_bbsetup(void)
2004{
2005 BBCC* bbcc = CLG_(current_state).bbcc;
2006
2007 if (CLG_(clo).simulate_cache) {
2008 BB* bb = bbcc->bb;
2009
2010 /* only needed if log_* functions are called */
2011 bb_base = bb->obj->offset + bb->offset;
2012 cost_base = bbcc->cost;
2013 }
2014}
2015
2016static
2017void cachesim_finish(void)
2018{
2019 if (clo_collect_cacheuse)
2020 cacheuse_finish();
2021}
2022
2023/*------------------------------------------------------------*/
2024/*--- The simulator defined in this file ---*/
2025/*------------------------------------------------------------*/
2026
/* The simulator interface exported by this file; consumed by the rest
 * of Callgrind through the CLG_(cachesim) handle. */
struct cachesim_if CLG_(cachesim) = {
  /* option handling and lifecycle hooks */
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .after_bbsetup = cachesim_after_bbsetup,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D  = 0,

  .log_1I1Dr = 0,
  .log_1I1Dw = 0,
  .log_1I2D  = 0,

  .log_0I1Dr = 0,
  .log_0I1Dw = 0,
  .log_0I2D  = 0,

  /* human-readable names of the (not yet selected) log handlers,
   * also filled in by cachesim_post_clo_init */
  .log_1I0D_name = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",
  .log_1I2D_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
  .log_0I2D_name = "(no function)"
};
2059
2060
2061/*--------------------------------------------------------------------*/
2062/*--- end ct_sim.c ---*/
2063/*--------------------------------------------------------------------*/
2064