Blame - callgrind/sim.c - platform/external/valgrind

blob: 3d9ae6c2b7bdcb4be94c2709a241843c1fab235f [file] [log] [blame]

weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1
/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/
				6
				7	/*
njn	9a0cba4	2007-04-15 22:15:57 +0000	[diff] [blame]	8	This file is part of Callgrind, a Valgrind tool for call graph
				9	profiling programs.
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	10
njn	9a0cba4	2007-04-15 22:15:57 +0000	[diff] [blame]	11	Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	12
njn	9a0cba4	2007-04-15 22:15:57 +0000	[diff] [blame]	13	This tool is derived from and contains code from Cachegrind
sewardj	4d474d0	2008-02-11 11:34:59 +0000	[diff] [blame]	14	Copyright (C) 2002-2008 Nicholas Nethercote (njn@valgrind.org)
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	15
				16	This program is free software; you can redistribute it and/or
				17	modify it under the terms of the GNU General Public License as
				18	published by the Free Software Foundation; either version 2 of the
				19	License, or (at your option) any later version.
				20
				21	This program is distributed in the hope that it will be useful, but
				22	WITHOUT ANY WARRANTY; without even the implied warranty of
				23	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				24	General Public License for more details.
				25
				26	You should have received a copy of the GNU General Public License
				27	along with this program; if not, write to the Free Software
				28	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
				29	02111-1307, USA.
				30
				31	The GNU General Public License is contained in the file COPYING.
				32	*/
				33
				34	#include "global.h"
				35
				36
				37	/* Notes:
				38	- simulates a write-allocate cache
				39	- (block --> set) hash function uses simple bit selection
				40	- handling of references straddling two cache blocks:
				41	- counts as only one cache access (not two)
				42	- both blocks hit --> one hit
				43	- one block hits, the other misses --> one miss
				44	- both blocks miss --> one miss (not two)
				45	*/
				46
				47	/* Cache configuration */
				48	#include "cg_arch.h"
				49
				50	/* additional structures for cache use info, separated
				51	* according usage frequency:
				52	* - line_loaded : pointer to cost center of instruction
				53	* which loaded the line into cache.
				54	* Needed to increment counters when line is evicted.
				55	* - line_use : updated on every access
				56	*/
				57	typedef struct {
				58	UInt count;
				59	UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
				60	} line_use;
				61
				62	typedef struct {
				63	Addr memline, iaddr;
				64	line_use* dep_use; /* point to higher-level cacheblock for this memline */
				65	ULong* use_base;
				66	} line_loaded;
				67
				68	/* Cache state */
				69	typedef struct {
				70	char* name;
				71	int size; /* bytes */
				72	int assoc;
				73	int line_size; /* bytes */
				74	Bool sectored; /* prefetch nearside cacheline on read */
				75	int sets;
				76	int sets_min_1;
				77	int assoc_bits;
				78	int line_size_bits;
				79	int tag_shift;
				80	UWord tag_mask;
				81	char desc_line[128];
				82	UWord* tags;
				83
				84	/* for cache use */
				85	int line_size_mask;
				86	int* line_start_mask;
				87	int* line_end_mask;
				88	line_loaded* loaded;
				89	line_use* use;
				90	} cache_t2;
				91
				92	/*
				93	* States of flat caches in our model.
				94	* We use a 2-level hierarchy,
				95	*/
				96	static cache_t2 I1, D1, L2;
				97
				98	/* Lower bits of cache tags are used as flags for a cache line */
				99	#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
				100	#define CACHELINE_DIRTY 1
				101
				102
				103	/* Cache simulator Options */
				104	static Bool clo_simulate_writeback = False;
				105	static Bool clo_simulate_hwpref = False;
				106	static Bool clo_simulate_sectors = False;
				107	static Bool clo_collect_cacheuse = False;
				108
				109	/* Following global vars are setup before by
				110	* setup_bbcc()/cachesim_after_bbsetup():
				111	*
				112	* - Addr bb_base (instruction start address of original BB)
				113	* - ULong* cost_base (start of cost array for BB)
				114	* - BBCC* nonskipped (only != 0 when in a function not skipped)
				115	*/
				116
				117	/* Offset to events in event set, used in log_* functions */
				118	static Int off_D0_Ir;
				119	static Int off_D1r_Ir;
				120	static Int off_D1r_Dr;
				121	static Int off_D1w_Ir;
				122	static Int off_D1w_Dw;
				123	static Int off_D2_Ir;
				124	static Int off_D2_Dr;
				125	static Int off_D2_Dw;
				126
				127	static Addr bb_base;
				128	static ULong* cost_base;
				129	static InstrInfo* current_ii;
				130
				131	/* Cache use offsets */
				132	/* FIXME: The offsets are only correct because all eventsets get
				133	* the "Use" set added first !
				134	*/
				135	static Int off_I1_AcCost = 0;
				136	static Int off_I1_SpLoss = 1;
				137	static Int off_D1_AcCost = 0;
				138	static Int off_D1_SpLoss = 1;
				139	static Int off_L2_AcCost = 2;
				140	static Int off_L2_SpLoss = 3;
				141
				142	/* Cache access types */
				143	typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
				144
				145	/* Result of a reference into a flat cache */
				146	typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
				147
				148	/* Result of a reference into a hierarchical cache model */
				149	typedef enum {
				150	L1_Hit,
				151	L2_Hit,
				152	MemAccess,
				153	WriteBackMemAccess } CacheModelResult;
				154
				155	typedef CacheModelResult (*simcall_type)(Addr, UChar);
				156
				157	static struct {
				158	simcall_type I1_Read;
				159	simcall_type D1_Read;
				160	simcall_type D1_Write;
				161	} simulator;
				162
/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/
				166
				167	static void cachesim_clearcache(cache_t2* c)
				168	{
				169	Int i;
				170
				171	for (i = 0; i < c->sets * c->assoc; i++)
				172	c->tags[i] = 0;
				173	if (c->use) {
				174	for (i = 0; i < c->sets * c->assoc; i++) {
				175	c->loaded[i].memline = 0;
				176	c->loaded[i].use_base = 0;
				177	c->loaded[i].dep_use = 0;
				178	c->loaded[i].iaddr = 0;
				179	c->use[i].mask = 0;
				180	c->use[i].count = 0;
				181	c->tags[i] = i % c->assoc; /* init lower bits as pointer */
				182	}
				183	}
				184	}
				185
				186	static void cacheuse_initcache(cache_t2* c);
				187
				188	/* By this point, the size/assoc/line_size has been checked. */
				189	static void cachesim_initcache(cache_t config, cache_t2* c)
				190	{
				191	c->size = config.size;
				192	c->assoc = config.assoc;
				193	c->line_size = config.line_size;
				194	c->sectored = False; // FIXME
				195
				196	c->sets = (c->size / c->line_size) / c->assoc;
				197	c->sets_min_1 = c->sets - 1;
				198	c->assoc_bits = VG_(log2)(c->assoc);
				199	c->line_size_bits = VG_(log2)(c->line_size);
				200	c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
				201	c->tag_mask = ~((1<<c->tag_shift)-1);
				202
				203	/* Can bits in tag entries be used for flags?
				204	* Should be always true as MIN_LINE_SIZE >= 16 */
				205	CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
				206
				207	if (c->assoc == 1) {
				208	VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
				209	c->size, c->line_size,
				210	c->sectored ? ", sectored":"");
				211	} else {
				212	VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
				213	c->size, c->line_size, c->assoc,
				214	c->sectored ? ", sectored":"");
				215	}
				216
				217	c->tags = (UWord) CLG_MALLOC(sizeof(UWord) c->sets * c->assoc);
				218	if (clo_collect_cacheuse)
				219	cacheuse_initcache(c);
				220	else
				221	c->use = 0;
				222	cachesim_clearcache(c);
				223	}
				224
				225
				226	#if 0
				227	static void print_cache(cache_t2* c)
				228	{
				229	UInt set, way, i;
				230
				231	/* Note initialisation and update of 'i'. */
				232	for (i = 0, set = 0; set < c->sets; set++) {
				233	for (way = 0; way < c->assoc; way++, i++) {
				234	VG_(printf)("%8x ", c->tags[i]);
				235	}
				236	VG_(printf)("\n");
				237	}
				238	}
				239	#endif
				240
				241
/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/
				245
				246	/*
				247	* Simple model: L1 & L2 Write Through
				248	* Does not distinguish among read and write references
				249	*
				250	* Simulator functions:
				251	* CacheModelResult cachesim_I1_ref(Addr a, UChar size)
				252	* CacheModelResult cachesim_D1_ref(Addr a, UChar size)
				253	*/
				254
				255	static __inline__
				256	CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
				257	{
				258	int i, j;
				259	UWord *set;
				260
				261	/* Shifting is a bit faster than multiplying */
				262	set = &(c->tags[set_no << c->assoc_bits]);
				263
				264	/* This loop is unrolled for just the first case, which is the most */
				265	/* common. We can't unroll any further because it would screw up */
				266	/* if we have a direct-mapped (1-way) cache. */
				267	if (tag == set[0])
				268	return Hit;
				269
				270	/* If the tag is one other than the MRU, move it into the MRU spot */
				271	/* and shuffle the rest down. */
				272	for (i = 1; i < c->assoc; i++) {
				273	if (tag == set[i]) {
				274	for (j = i; j > 0; j--) {
				275	set[j] = set[j - 1];
				276	}
				277	set[0] = tag;
				278	return Hit;
				279	}
				280	}
				281
				282	/* A miss; install this tag as MRU, shuffle rest down. */
				283	for (j = c->assoc - 1; j > 0; j--) {
				284	set[j] = set[j - 1];
				285	}
				286	set[0] = tag;
				287
				288	return Miss;
				289	}
				290
				291	static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
				292	{
				293	UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
				294	UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
				295	UWord tag = a >> c->tag_shift;
				296
				297	/* Access entirely within line. */
				298	if (set1 == set2)
				299	return cachesim_setref(c, set1, tag);
				300
				301	/* Access straddles two lines. */
				302	/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
				303	else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	304	UWord tag2 = (a+size-1) >> c->tag_shift;
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	305
				306	/* the call updates cache structures as side effect */
				307	CacheResult res1 = cachesim_setref(c, set1, tag);
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	308	CacheResult res2 = cachesim_setref(c, set2, tag2);
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	309	return ((res1 == Miss) \|\| (res2 == Miss)) ? Miss : Hit;
				310
				311	} else {
njn	8a7b41b	2007-09-23 00:51:24 +0000	[diff] [blame]	312	VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	313	VG_(tool_panic)("item straddles more than two cache sets");
				314	}
				315	return Hit;
				316	}
				317
				318	static
				319	CacheModelResult cachesim_I1_ref(Addr a, UChar size)
				320	{
				321	if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
				322	if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
				323	return MemAccess;
				324	}
				325
				326	static
				327	CacheModelResult cachesim_D1_ref(Addr a, UChar size)
				328	{
				329	if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
				330	if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
				331	return MemAccess;
				332	}
				333
				334
/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/
				338
				339	/*
				340	* More complex model: L1 Write-through, L2 Write-back
				341	* This needs to distinguish among read and write references.
				342	*
				343	* Simulator functions:
				344	* CacheModelResult cachesim_I1_Read(Addr a, UChar size)
				345	* CacheModelResult cachesim_D1_Read(Addr a, UChar size)
				346	* CacheModelResult cachesim_D1_Write(Addr a, UChar size)
				347	*/
				348
				349	/*
				350	* With write-back, result can be a miss evicting a dirty line
				351	* The dirty state of a cache line is stored in Bit0 of the tag for
				352	* this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
				353	* type (Read/Write), the line gets dirty on a write.
				354	*/
				355	static __inline__
				356	CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
				357	{
				358	int i, j;
				359	UWord *set, tmp_tag;
				360
				361	/* Shifting is a bit faster than multiplying */
				362	set = &(c->tags[set_no << c->assoc_bits]);
				363
				364	/* This loop is unrolled for just the first case, which is the most */
				365	/* common. We can't unroll any further because it would screw up */
				366	/* if we have a direct-mapped (1-way) cache. */
				367	if (tag == (set[0] & ~CACHELINE_DIRTY)) {
				368	set[0] \|= ref;
				369	return Hit;
				370	}
				371	/* If the tag is one other than the MRU, move it into the MRU spot */
				372	/* and shuffle the rest down. */
				373	for (i = 1; i < c->assoc; i++) {
				374	if (tag == (set[i] & ~CACHELINE_DIRTY)) {
				375	tmp_tag = set[i] \| ref; // update dirty flag
				376	for (j = i; j > 0; j--) {
				377	set[j] = set[j - 1];
				378	}
				379	set[0] = tmp_tag;
				380	return Hit;
				381	}
				382	}
				383
				384	/* A miss; install this tag as MRU, shuffle rest down. */
				385	tmp_tag = set[c->assoc - 1];
				386	for (j = c->assoc - 1; j > 0; j--) {
				387	set[j] = set[j - 1];
				388	}
				389	set[0] = tag \| ref;
				390
				391	return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
				392	}
				393
				394
				395	static __inline__
				396	CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
				397	{
				398	UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
				399	UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
				400	UWord tag = a & c->tag_mask;
				401
				402	/* Access entirely within line. */
				403	if (set1 == set2)
				404	return cachesim_setref_wb(c, ref, set1, tag);
				405
				406	/* Access straddles two lines. */
				407	/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
				408	else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	409	UWord tag2 = (a+size-1) >> c->tag_shift;
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	410
				411	/* the call updates cache structures as side effect */
				412	CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	413	CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	414
				415	if ((res1 == MissDirty) \|\| (res2 == MissDirty)) return MissDirty;
				416	return ((res1 == Miss) \|\| (res2 == Miss)) ? Miss : Hit;
				417
				418	} else {
njn	8a7b41b	2007-09-23 00:51:24 +0000	[diff] [blame]	419	VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	420	VG_(tool_panic)("item straddles more than two cache sets");
				421	}
				422	return Hit;
				423	}
				424
				425
				426	static
				427	CacheModelResult cachesim_I1_Read(Addr a, UChar size)
				428	{
				429	if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
				430	switch( cachesim_ref_wb( &L2, Read, a, size) ) {
				431	case Hit: return L2_Hit;
				432	case Miss: return MemAccess;
				433	default: break;
				434	}
				435	return WriteBackMemAccess;
				436	}
				437
				438	static
				439	CacheModelResult cachesim_D1_Read(Addr a, UChar size)
				440	{
				441	if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
				442	switch( cachesim_ref_wb( &L2, Read, a, size) ) {
				443	case Hit: return L2_Hit;
				444	case Miss: return MemAccess;
				445	default: break;
				446	}
				447	return WriteBackMemAccess;
				448	}
				449
				450	static
				451	CacheModelResult cachesim_D1_Write(Addr a, UChar size)
				452	{
				453	if ( cachesim_ref( &D1, a, size) == Hit ) {
				454	/* Even for a L1 hit, the write-trough L1 passes
				455	* the write to the L2 to make the L2 line dirty.
				456	* But this causes no latency, so return the hit.
				457	*/
				458	cachesim_ref_wb( &L2, Write, a, size);
				459	return L1_Hit;
				460	}
				461	switch( cachesim_ref_wb( &L2, Write, a, size) ) {
				462	case Hit: return L2_Hit;
				463	case Miss: return MemAccess;
				464	default: break;
				465	}
				466	return WriteBackMemAccess;
				467	}
				468
				469
/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/
				473
				474	static ULong prefetch_up = 0;
				475	static ULong prefetch_down = 0;
				476
				477	#define PF_STREAMS 8
				478	#define PF_PAGEBITS 12
				479
				480	static UInt pf_lastblock[PF_STREAMS];
				481	static Int pf_seqblocks[PF_STREAMS];
				482
				483	static
				484	void prefetch_clear(void)
				485	{
				486	int i;
				487	for(i=0;i<PF_STREAMS;i++)
				488	pf_lastblock[i] = pf_seqblocks[i] = 0;
				489	}
				490
				491	/*
				492	* HW Prefetch emulation
				493	* Start prefetching when detecting sequential access to 3 memory blocks.
				494	* One stream can be detected per 4k page.
				495	*/
				496	static __inline__
				497	void prefetch_L2_doref(Addr a, UChar size)
				498	{
				499	UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
				500	UInt block = ( a >> L2.line_size_bits);
				501
				502	if (block != pf_lastblock[stream]) {
				503	if (pf_seqblocks[stream] == 0) {
				504	if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
				505	else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
				506	}
				507	else if (pf_seqblocks[stream] >0) {
				508	if (pf_lastblock[stream] +1 == block) {
				509	pf_seqblocks[stream]++;
				510	if (pf_seqblocks[stream] >= 2) {
				511	prefetch_up++;
				512	cachesim_ref(&L2, a + 5 * L2.line_size,1);
				513	}
				514	}
				515	else pf_seqblocks[stream] = 0;
				516	}
				517	else if (pf_seqblocks[stream] <0) {
				518	if (pf_lastblock[stream] -1 == block) {
				519	pf_seqblocks[stream]--;
				520	if (pf_seqblocks[stream] <= -2) {
				521	prefetch_down++;
				522	cachesim_ref(&L2, a - 5 * L2.line_size,1);
				523	}
				524	}
				525	else pf_seqblocks[stream] = 0;
				526	}
				527	pf_lastblock[stream] = block;
				528	}
				529	}
				530
				531	/* simple model with hardware prefetch */
				532
				533	static
				534	CacheModelResult prefetch_I1_ref(Addr a, UChar size)
				535	{
				536	if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
				537	prefetch_L2_doref(a,size);
				538	if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
				539	return MemAccess;
				540	}
				541
				542	static
				543	CacheModelResult prefetch_D1_ref(Addr a, UChar size)
				544	{
				545	if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
				546	prefetch_L2_doref(a,size);
				547	if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
				548	return MemAccess;
				549	}
				550
				551
				552	/* complex model with hardware prefetch */
				553
				554	static
				555	CacheModelResult prefetch_I1_Read(Addr a, UChar size)
				556	{
				557	if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
				558	prefetch_L2_doref(a,size);
				559	switch( cachesim_ref_wb( &L2, Read, a, size) ) {
				560	case Hit: return L2_Hit;
				561	case Miss: return MemAccess;
				562	default: break;
				563	}
				564	return WriteBackMemAccess;
				565	}
				566
				567	static
				568	CacheModelResult prefetch_D1_Read(Addr a, UChar size)
				569	{
				570	if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
				571	prefetch_L2_doref(a,size);
				572	switch( cachesim_ref_wb( &L2, Read, a, size) ) {
				573	case Hit: return L2_Hit;
				574	case Miss: return MemAccess;
				575	default: break;
				576	}
				577	return WriteBackMemAccess;
				578	}
				579
				580	static
				581	CacheModelResult prefetch_D1_Write(Addr a, UChar size)
				582	{
				583	prefetch_L2_doref(a,size);
				584	if ( cachesim_ref( &D1, a, size) == Hit ) {
				585	/* Even for a L1 hit, the write-trough L1 passes
				586	* the write to the L2 to make the L2 line dirty.
				587	* But this causes no latency, so return the hit.
				588	*/
				589	cachesim_ref_wb( &L2, Write, a, size);
				590	return L1_Hit;
				591	}
				592	switch( cachesim_ref_wb( &L2, Write, a, size) ) {
				593	case Hit: return L2_Hit;
				594	case Miss: return MemAccess;
				595	default: break;
				596	}
				597	return WriteBackMemAccess;
				598	}
				599
				600
/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/
				604
				605	/* can not be combined with write-back or prefetch */
				606
				607	static
				608	void cacheuse_initcache(cache_t2* c)
				609	{
				610	int i;
				611	unsigned int start_mask, start_val;
				612	unsigned int end_mask, end_val;
				613
				614	c->use = CLG_MALLOC(sizeof(line_use) * c->sets * c->assoc);
				615	c->loaded = CLG_MALLOC(sizeof(line_loaded) * c->sets * c->assoc);
				616	c->line_start_mask = CLG_MALLOC(sizeof(int) * c->line_size);
				617	c->line_end_mask = CLG_MALLOC(sizeof(int) * c->line_size);
				618
				619
				620	c->line_size_mask = c->line_size-1;
				621
				622	/* Meaning of line_start_mask/line_end_mask
				623	* Example: for a given cache line, you get an access starting at
				624	* byte offset 5, length 4, byte 5 - 8 was touched. For a cache
				625	* line size of 32, you have 1 bit per byte in the mask:
				626	*
				627	* bit31 bit8 bit5 bit 0
				628	* \| \| \| \|
				629	* 11..111111100000 line_start_mask[5]
				630	* 00..000111111111 line_end_mask[(5+4)-1]
				631	*
				632	* use_mask \|= line_start_mask[5] && line_end_mask[8]
				633	*
				634	*/
				635	start_val = end_val = ~0;
				636	if (c->line_size < 32) {
				637	int bits_per_byte = 32/c->line_size;
				638	start_mask = (1<<bits_per_byte)-1;
				639	end_mask = start_mask << (32-bits_per_byte);
				640	for(i=0;i<c->line_size;i++) {
				641	c->line_start_mask[i] = start_val;
				642	start_val = start_val & ~start_mask;
				643	start_mask = start_mask << bits_per_byte;
				644
				645	c->line_end_mask[c->line_size-i-1] = end_val;
				646	end_val = end_val & ~end_mask;
				647	end_mask = end_mask >> bits_per_byte;
				648	}
				649	}
				650	else {
				651	int bytes_per_bit = c->line_size/32;
				652	start_mask = 1;
				653	end_mask = 1 << 31;
				654	for(i=0;i<c->line_size;i++) {
				655	c->line_start_mask[i] = start_val;
				656	c->line_end_mask[c->line_size-i-1] = end_val;
				657	if ( ((i+1)%bytes_per_bit) == 0) {
				658	start_val &= ~start_mask;
				659	end_val &= ~end_mask;
				660	start_mask <<= 1;
				661	end_mask >>= 1;
				662	}
				663	}
				664	}
				665
				666	CLG_DEBUG(6, "Config %s:\n", c->desc_line);
				667	for(i=0;i<c->line_size;i++) {
				668	CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
				669	i, c->line_start_mask[i], c->line_end_mask[i]);
				670	}
				671
				672	/* We use lower tag bits as offset pointers to cache use info.
				673	* I.e. some cache parameters don't work.
				674	*/
				675	if (c->tag_shift < c->assoc_bits) {
				676	VG_(message)(Vg_DebugMsg,
				677	"error: Use associativity < %d for cache use statistics!",
				678	(1<<c->tag_shift) );
				679	VG_(tool_panic)("Unsupported cache configuration");
				680	}
				681	}
				682
				683	/* FIXME: A little tricky */
				684	#if 0
				685
				686	static __inline__
				687	void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
				688	{
				689	int idx = (high_idx << c->assoc_bits) \| low_idx;
				690
				691	c->use[idx].count ++;
				692	c->use[idx].mask \|= use_mask;
				693
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	694	CLG_DEBUG(6," Hit [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	695	idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
				696	use_mask, c->use[idx].mask, c->use[idx].count);
				697	}
				698
				699	/* only used for I1, D1 */
				700
				701	static __inline__
				702	CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
				703	{
				704	int i, j, idx;
				705	UWord *set, tmp_tag;
				706	UInt use_mask;
				707
				708	/* Shifting is a bit faster than multiplying */
				709	set = &(c->tags[set_no << c->assoc_bits]);
				710	use_mask =
				711	c->line_start_mask[a & c->line_size_mask] &
				712	c->line_end_mask[(a+size-1) & c->line_size_mask];
				713
				714	/* This loop is unrolled for just the first case, which is the most */
				715	/* common. We can't unroll any further because it would screw up */
				716	/* if we have a direct-mapped (1-way) cache. */
				717	if (tag == (set[0] & c->tag_mask)) {
				718	cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask);
				719	return L1_Hit;
				720	}
				721
				722	/* If the tag is one other than the MRU, move it into the MRU spot */
				723	/* and shuffle the rest down. */
				724	for (i = 1; i < c->assoc; i++) {
				725	if (tag == (set[i] & c->tag_mask)) {
				726	tmp_tag = set[i];
				727	for (j = i; j > 0; j--) {
				728	set[j] = set[j - 1];
				729	}
				730	set[0] = tmp_tag;
				731
				732	cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
				733	return L1_Hit;
				734	}
				735	}
				736
				737	/* A miss; install this tag as MRU, shuffle rest down. */
				738	tmp_tag = set[L.assoc - 1] & ~c->tag_mask;
				739	for (j = c->assoc - 1; j > 0; j--) {
				740	set[j] = set[j - 1];
				741	}
				742	set[0] = tag \| tmp_tag;
				743
				744	cacheuse_L2_miss(c, (set_no << c->assoc_bits) \| tmp_tag,
				745	use_mask, a & ~c->line_size_mask);
				746
				747	return Miss;
				748	}
				749
				750
				751	static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
				752	{
				753	UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
				754	UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
				755	UWord tag = a >> c->tag_shift;
				756
				757	/* Access entirely within line. */
				758	if (set1 == set2)
				759	return cacheuse_setref(c, set1, tag);
				760
				761	/* Access straddles two lines. */
				762	/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
				763	else if (((set1 + 1) & (c->sets-1)) == set2) {
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	764	UWord tag2 = a >> c->tag_shift;
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	765
				766	/* the call updates cache structures as side effect */
				767	CacheResult res1 = cacheuse_isMiss(c, set1, tag);
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	768	CacheResult res2 = cacheuse_isMiss(c, set2, tag2);
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	769	return ((res1 == Miss) \|\| (res2 == Miss)) ? Miss : Hit;
				770
				771	} else {
				772	VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2);
				773	VG_(tool_panic)("item straddles more than two cache sets");
				774	}
				775	return Hit;
				776	}
				777	#endif
				778
				779
				780	/* for I1/D1 caches */
				781	#define CACHEUSE(L) \
				782	\
				783	static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
				784	{ \
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	785	UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
				786	UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
				787	UWord tag = a & L.tag_mask; \
				788	UWord tag2; \
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	789	int i, j, idx; \
				790	UWord *set, tmp_tag; \
				791	UInt use_mask; \
				792	\
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	793	CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n", \
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	794	L.name, a, size, set1, set2); \
				795	\
				796	/* First case: word entirely within line. */ \
				797	if (set1 == set2) { \
				798	\
				799	/* Shifting is a bit faster than multiplying */ \
				800	set = &(L.tags[set1 << L.assoc_bits]); \
				801	use_mask = L.line_start_mask[a & L.line_size_mask] & \
				802	L.line_end_mask[(a+size-1) & L.line_size_mask]; \
				803	\
				804	/* This loop is unrolled for just the first case, which is the most */\
				805	/* common. We can't unroll any further because it would screw up */\
				806	/* if we have a direct-mapped (1-way) cache. */\
				807	if (tag == (set[0] & L.tag_mask)) { \
				808	idx = (set1 << L.assoc_bits) \| (set[0] & ~L.tag_mask); \
				809	L.use[idx].count ++; \
				810	L.use[idx].mask \|= use_mask; \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	811	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	812	idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
				813	use_mask, L.use[idx].mask, L.use[idx].count); \
				814	return L1_Hit; \
				815	} \
				816	/* If the tag is one other than the MRU, move it into the MRU spot */\
				817	/* and shuffle the rest down. */\
				818	for (i = 1; i < L.assoc; i++) { \
				819	if (tag == (set[i] & L.tag_mask)) { \
				820	tmp_tag = set[i]; \
				821	for (j = i; j > 0; j--) { \
				822	set[j] = set[j - 1]; \
				823	} \
				824	set[0] = tmp_tag; \
				825	idx = (set1 << L.assoc_bits) \| (tmp_tag & ~L.tag_mask); \
				826	L.use[idx].count ++; \
				827	L.use[idx].mask \|= use_mask; \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	828	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	829	i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
				830	use_mask, L.use[idx].mask, L.use[idx].count); \
				831	return L1_Hit; \
				832	} \
				833	} \
				834	\
				835	/* A miss; install this tag as MRU, shuffle rest down. */ \
				836	tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
				837	for (j = L.assoc - 1; j > 0; j--) { \
				838	set[j] = set[j - 1]; \
				839	} \
				840	set[0] = tag \| tmp_tag; \
				841	idx = (set1 << L.assoc_bits) \| tmp_tag; \
				842	return update_##L##_use(&L, idx, \
				843	use_mask, a &~ L.line_size_mask); \
				844	\
				845	/* Second case: word straddles two lines. */ \
				846	/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
				847	} else if (((set1 + 1) & (L.sets-1)) == set2) { \
				848	Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
				849	set = &(L.tags[set1 << L.assoc_bits]); \
				850	use_mask = L.line_start_mask[a & L.line_size_mask]; \
				851	if (tag == (set[0] & L.tag_mask)) { \
				852	idx = (set1 << L.assoc_bits) \| (set[0] & ~L.tag_mask); \
				853	L.use[idx].count ++; \
				854	L.use[idx].mask \|= use_mask; \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	855	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	856	idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
				857	use_mask, L.use[idx].mask, L.use[idx].count); \
				858	goto block2; \
				859	} \
				860	for (i = 1; i < L.assoc; i++) { \
				861	if (tag == (set[i] & L.tag_mask)) { \
				862	tmp_tag = set[i]; \
				863	for (j = i; j > 0; j--) { \
				864	set[j] = set[j - 1]; \
				865	} \
				866	set[0] = tmp_tag; \
				867	idx = (set1 << L.assoc_bits) \| (tmp_tag & ~L.tag_mask); \
				868	L.use[idx].count ++; \
				869	L.use[idx].mask \|= use_mask; \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	870	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	871	i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
				872	use_mask, L.use[idx].mask, L.use[idx].count); \
				873	goto block2; \
				874	} \
				875	} \
				876	tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
				877	for (j = L.assoc - 1; j > 0; j--) { \
				878	set[j] = set[j - 1]; \
				879	} \
				880	set[0] = tag \| tmp_tag; \
				881	idx = (set1 << L.assoc_bits) \| tmp_tag; \
				882	miss1 = update_##L##_use(&L, idx, \
				883	use_mask, a &~ L.line_size_mask); \
				884	block2: \
				885	set = &(L.tags[set2 << L.assoc_bits]); \
				886	use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	887	tag2 = (a+size-1) & L.tag_mask; \
				888	if (tag2 == (set[0] & L.tag_mask)) { \
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	889	idx = (set2 << L.assoc_bits) \| (set[0] & ~L.tag_mask); \
				890	L.use[idx].count ++; \
				891	L.use[idx].mask \|= use_mask; \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	892	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	893	idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
				894	use_mask, L.use[idx].mask, L.use[idx].count); \
				895	return miss1; \
				896	} \
				897	for (i = 1; i < L.assoc; i++) { \
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	898	if (tag2 == (set[i] & L.tag_mask)) { \
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	899	tmp_tag = set[i]; \
				900	for (j = i; j > 0; j--) { \
				901	set[j] = set[j - 1]; \
				902	} \
				903	set[0] = tmp_tag; \
				904	idx = (set2 << L.assoc_bits) \| (tmp_tag & ~L.tag_mask); \
				905	L.use[idx].count ++; \
				906	L.use[idx].mask \|= use_mask; \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	907	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	908	i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
				909	use_mask, L.use[idx].mask, L.use[idx].count); \
				910	return miss1; \
				911	} \
				912	} \
				913	tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
				914	for (j = L.assoc - 1; j > 0; j--) { \
				915	set[j] = set[j - 1]; \
				916	} \
weidendo	28e2a14	2006-11-22 21:00:53 +0000	[diff] [blame]	917	set[0] = tag2 \| tmp_tag; \
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	918	idx = (set2 << L.assoc_bits) \| tmp_tag; \
				919	miss2 = update_##L##_use(&L, idx, \
				920	use_mask, (a+size-1) &~ L.line_size_mask); \
				921	return (miss1==MemAccess \|\| miss2==MemAccess) ? MemAccess:L2_Hit; \
				922	\
				923	} else { \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	924	VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	925	VG_(tool_panic)("item straddles more than two cache sets"); \
				926	} \
				927	return 0; \
				928	}
				929
				930
				931	/* logarithmic bitcounting algorithm, see
				932	* http://graphics.stanford.edu/~seander/bithacks.html
				933	*/
				934	static __inline__ unsigned int countBits(unsigned int bits)
				935	{
				936	unsigned int c; // store the total here
				937	const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
				938	const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
				939
				940	c = bits;
				941	c = ((c >> S[0]) & B[0]) + (c & B[0]);
				942	c = ((c >> S[1]) & B[1]) + (c & B[1]);
				943	c = ((c >> S[2]) & B[2]) + (c & B[2]);
				944	c = ((c >> S[3]) & B[3]) + (c & B[3]);
				945	c = ((c >> S[4]) & B[4]) + (c & B[4]);
				946	return c;
				947	}
				948
				949	static void update_L2_use(int idx, Addr memline)
				950	{
				951	line_loaded* loaded = &(L2.loaded[idx]);
				952	line_use* use = &(L2.use[idx]);
				953	int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
				954
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	955	CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	956	idx, bb_base + current_ii->instr_offset, memline);
				957	if (use->count>0) {
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	958	CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	959	use->count, i, use->mask, loaded->memline, loaded->iaddr);
				960	CLG_DEBUG(2, " collect: %d, use_base %p\n",
				961	CLG_(current_state).collect, loaded->use_base);
				962
				963	if (CLG_(current_state).collect && loaded->use_base) {
				964	(loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
				965	(loaded->use_base)[off_L2_SpLoss] += i;
				966	}
				967	}
				968
				969	use->count = 0;
				970	use->mask = 0;
				971
				972	loaded->memline = memline;
				973	loaded->iaddr = bb_base + current_ii->instr_offset;
				974	loaded->use_base = (CLG_(current_state).nonskipped) ?
				975	CLG_(current_state).nonskipped->skipped :
				976	cost_base + current_ii->cost_offset;
				977	}
				978
				979	static
				980	CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
				981	{
				982	UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
				983	UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
				984	UWord tag = memline & L2.tag_mask;
				985
				986	int i, j, idx;
				987	UWord tmp_tag;
				988
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	989	CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	990
				991	if (tag == (set[0] & L2.tag_mask)) {
				992	idx = (setNo << L2.assoc_bits) \| (set[0] & ~L2.tag_mask);
				993	l1_loaded->dep_use = &(L2.use[idx]);
				994
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	995	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	996	idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
				997	L2.use[idx].mask, L2.use[idx].count);
				998	return L2_Hit;
				999	}
				1000	for (i = 1; i < L2.assoc; i++) {
				1001	if (tag == (set[i] & L2.tag_mask)) {
				1002	tmp_tag = set[i];
				1003	for (j = i; j > 0; j--) {
				1004	set[j] = set[j - 1];
				1005	}
				1006	set[0] = tmp_tag;
				1007	idx = (setNo << L2.assoc_bits) \| (tmp_tag & ~L2.tag_mask);
				1008	l1_loaded->dep_use = &(L2.use[idx]);
				1009
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1010	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1011	i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
				1012	L2.use[idx].mask, L2.use[idx].count);
				1013	return L2_Hit;
				1014	}
				1015	}
				1016
				1017	/* A miss; install this tag as MRU, shuffle rest down. */
				1018	tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
				1019	for (j = L2.assoc - 1; j > 0; j--) {
				1020	set[j] = set[j - 1];
				1021	}
				1022	set[0] = tag \| tmp_tag;
				1023	idx = (setNo << L2.assoc_bits) \| tmp_tag;
				1024	l1_loaded->dep_use = &(L2.use[idx]);
				1025
				1026	update_L2_use(idx, memline);
				1027
				1028	return MemAccess;
				1029	}
				1030
				1031
				1032
				1033
				1034	#define UPDATE_USE(L) \
				1035	\
				1036	static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
				1037	UInt mask, Addr memline) \
				1038	{ \
				1039	line_loaded* loaded = &(cache->loaded[idx]); \
				1040	line_use* use = &(cache->use[idx]); \
				1041	int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
				1042	\
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1043	CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1044	cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
				1045	if (use->count>0) { \
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1046	CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1047	use->count, c, use->mask, loaded->memline, loaded->iaddr); \
				1048	CLG_DEBUG(2, " collect: %d, use_base %p\n", \
				1049	CLG_(current_state).collect, loaded->use_base); \
				1050	\
				1051	if (CLG_(current_state).collect && loaded->use_base) { \
				1052	(loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
				1053	(loaded->use_base)[off_##L##_SpLoss] += c; \
				1054	\
				1055	/* FIXME (?): L1/L2 line sizes must be equal ! */ \
				1056	loaded->dep_use->mask \|= use->mask; \
				1057	loaded->dep_use->count += use->count; \
				1058	} \
				1059	} \
				1060	\
				1061	use->count = 1; \
				1062	use->mask = mask; \
				1063	loaded->memline = memline; \
				1064	loaded->iaddr = bb_base + current_ii->instr_offset; \
				1065	loaded->use_base = (CLG_(current_state).nonskipped) ? \
				1066	CLG_(current_state).nonskipped->skipped : \
				1067	cost_base + current_ii->cost_offset; \
				1068	\
				1069	if (memline == 0) return L2_Hit; \
				1070	return cacheuse_L2_access(memline, loaded); \
				1071	}
				1072
				1073	UPDATE_USE(I1);
				1074	UPDATE_USE(D1);
				1075
				1076	CACHEUSE(I1);
				1077	CACHEUSE(D1);
				1078
				1079
				1080	static
				1081	void cacheuse_finish(void)
				1082	{
				1083	int i;
				1084	InstrInfo ii = { 0,0,0,0,0 };
				1085
				1086	if (!CLG_(current_state).collect) return;
				1087
				1088	bb_base = 0;
				1089	current_ii = &ii;
				1090	cost_base = 0;
				1091
				1092	/* update usage counters */
				1093	if (I1.use)
				1094	for (i = 0; i < I1.sets * I1.assoc; i++)
				1095	if (I1.loaded[i].use_base)
				1096	update_I1_use( &I1, i, 0,0);
				1097
				1098	if (D1.use)
				1099	for (i = 0; i < D1.sets * D1.assoc; i++)
				1100	if (D1.loaded[i].use_base)
				1101	update_D1_use( &D1, i, 0,0);
				1102
				1103	if (L2.use)
				1104	for (i = 0; i < L2.sets * L2.assoc; i++)
				1105	if (L2.loaded[i].use_base)
				1106	update_L2_use(i, 0);
				1107	}
				1108
				1109
				1110
/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/
				1114
				1115
				1116	static __inline__
				1117	void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
				1118	{
				1119	switch(r) {
				1120	case WriteBackMemAccess:
				1121	if (clo_simulate_writeback) {
				1122	c1[3]++;
				1123	c2[3]++;
				1124	}
				1125	// fall through
				1126
				1127	case MemAccess:
				1128	c1[2]++;
				1129	c2[2]++;
				1130	// fall through
				1131
				1132	case L2_Hit:
				1133	c1[1]++;
				1134	c2[1]++;
				1135	// fall through
				1136
				1137	default:
				1138	c1[0]++;
				1139	c2[0]++;
				1140	}
				1141	}
				1142
				1143
				1144	VG_REGPARM(1)
				1145	static void log_1I0D(InstrInfo* ii)
				1146	{
				1147	CacheModelResult IrRes;
				1148
				1149	current_ii = ii;
				1150	IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
				1151
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1152	CLG_DEBUG(6, "log_1I0D: Ir=%#lx/%u => Ir %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1153	bb_base + ii->instr_offset, ii->instr_size, IrRes);
				1154
				1155	if (CLG_(current_state).collect) {
				1156	ULong* cost_Ir;
				1157
				1158	if (CLG_(current_state).nonskipped)
				1159	cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
				1160	else
				1161	cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
				1162
				1163	inc_costs(IrRes, cost_Ir,
				1164	CLG_(current_state).cost + CLG_(sets).off_full_Ir );
				1165	}
				1166	}
				1167
				1168
				1169	/* Instruction doing a read access */
				1170
				1171	VG_REGPARM(2)
				1172	static void log_1I1Dr(InstrInfo* ii, Addr data)
				1173	{
				1174	CacheModelResult IrRes, DrRes;
				1175
				1176	current_ii = ii;
				1177	IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
				1178	DrRes = (*simulator.D1_Read)(data, ii->data_size);
				1179
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1180	CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1181	bb_base + ii->instr_offset, ii->instr_size,
				1182	data, ii->data_size, IrRes, DrRes);
				1183
				1184	if (CLG_(current_state).collect) {
				1185	ULong cost_Ir, cost_Dr;
				1186
				1187	if (CLG_(current_state).nonskipped) {
				1188	cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
				1189	cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
				1190	}
				1191	else {
				1192	cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
				1193	cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
				1194	}
				1195
				1196	inc_costs(IrRes, cost_Ir,
				1197	CLG_(current_state).cost + CLG_(sets).off_full_Ir );
				1198	inc_costs(DrRes, cost_Dr,
				1199	CLG_(current_state).cost + CLG_(sets).off_full_Dr );
				1200	}
				1201	}
				1202
				1203
				1204	VG_REGPARM(2)
				1205	static void log_0I1Dr(InstrInfo* ii, Addr data)
				1206	{
				1207	CacheModelResult DrRes;
				1208
				1209	current_ii = ii;
				1210	DrRes = (*simulator.D1_Read)(data, ii->data_size);
				1211
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1212	CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1213	data, ii->data_size, DrRes);
				1214
				1215	if (CLG_(current_state).collect) {
				1216	ULong *cost_Dr;
				1217
				1218	if (CLG_(current_state).nonskipped) {
				1219	cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
				1220	}
				1221	else {
				1222	cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
				1223	}
				1224
				1225	inc_costs(DrRes, cost_Dr,
				1226	CLG_(current_state).cost + CLG_(sets).off_full_Dr );
				1227	}
				1228	}
				1229
				1230
				1231	/* Instruction doing a write access */
				1232
				1233	VG_REGPARM(2)
				1234	static void log_1I1Dw(InstrInfo* ii, Addr data)
				1235	{
				1236	CacheModelResult IrRes, DwRes;
				1237
				1238	current_ii = ii;
				1239	IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
				1240	DwRes = (*simulator.D1_Write)(data, ii->data_size);
				1241
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1242	CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1243	bb_base + ii->instr_offset, ii->instr_size,
				1244	data, ii->data_size, IrRes, DwRes);
				1245
				1246	if (CLG_(current_state).collect) {
				1247	ULong cost_Ir, cost_Dw;
				1248
				1249	if (CLG_(current_state).nonskipped) {
				1250	cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
				1251	cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
				1252	}
				1253	else {
				1254	cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
				1255	cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
				1256	}
				1257
				1258	inc_costs(IrRes, cost_Ir,
				1259	CLG_(current_state).cost + CLG_(sets).off_full_Ir );
				1260	inc_costs(DwRes, cost_Dw,
				1261	CLG_(current_state).cost + CLG_(sets).off_full_Dw );
				1262	}
				1263	}
				1264
				1265	VG_REGPARM(2)
				1266	static void log_0I1Dw(InstrInfo* ii, Addr data)
				1267	{
				1268	CacheModelResult DwRes;
				1269
				1270	current_ii = ii;
				1271	DwRes = (*simulator.D1_Write)(data, ii->data_size);
				1272
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1273	CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1274	data, ii->data_size, DwRes);
				1275
				1276	if (CLG_(current_state).collect) {
				1277	ULong *cost_Dw;
				1278
				1279	if (CLG_(current_state).nonskipped) {
				1280	cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
				1281	}
				1282	else {
				1283	cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
				1284	}
				1285
				1286	inc_costs(DwRes, cost_Dw,
				1287	CLG_(current_state).cost + CLG_(sets).off_full_Dw );
				1288	}
				1289	}
				1290
				1291	/* Instruction doing a read and a write access */
				1292
				1293	VG_REGPARM(3)
				1294	static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
				1295	{
				1296	CacheModelResult IrRes, DrRes, DwRes;
				1297
				1298	current_ii = ii;
				1299	IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
				1300	DrRes = (*simulator.D1_Read)(data1, ii->data_size);
				1301	DwRes = (*simulator.D1_Write)(data2, ii->data_size);
				1302
				1303	CLG_DEBUG(6,
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1304	"log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1305	bb_base + ii->instr_offset, ii->instr_size,
				1306	data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);
				1307
				1308	if (CLG_(current_state).collect) {
				1309	ULong cost_Ir, cost_Dr, *cost_Dw;
				1310
				1311	if (CLG_(current_state).nonskipped) {
				1312	cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
				1313	cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
				1314	cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
				1315	}
				1316	else {
				1317	cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
				1318	cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
				1319	cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
				1320	}
				1321
				1322	inc_costs(IrRes, cost_Ir,
				1323	CLG_(current_state).cost + CLG_(sets).off_full_Ir );
				1324	inc_costs(DrRes, cost_Dr,
				1325	CLG_(current_state).cost + CLG_(sets).off_full_Dr );
				1326	inc_costs(DwRes, cost_Dw,
				1327	CLG_(current_state).cost + CLG_(sets).off_full_Dw );
				1328	}
				1329	}
				1330
				1331	VG_REGPARM(3)
				1332	static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
				1333	{
				1334	CacheModelResult DrRes, DwRes;
				1335
				1336	current_ii = ii;
				1337	DrRes = (*simulator.D1_Read)(data1, ii->data_size);
				1338	DwRes = (*simulator.D1_Write)(data2, ii->data_size);
				1339
				1340	CLG_DEBUG(6,
bart	a0b6b2c	2008-07-07 06:49:24 +0000	[diff] [blame^]	1341	"log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1342	data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);
				1343
				1344	if (CLG_(current_state).collect) {
				1345	ULong cost_Dr, cost_Dw;
				1346
				1347	if (CLG_(current_state).nonskipped) {
				1348	cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
				1349	cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
				1350	}
				1351	else {
				1352	cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
				1353	cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
				1354	}
				1355
				1356	inc_costs(DrRes, cost_Dr,
				1357	CLG_(current_state).cost + CLG_(sets).off_full_Dr );
				1358	inc_costs(DwRes, cost_Dw,
				1359	CLG_(current_state).cost + CLG_(sets).off_full_Dw );
				1360	}
				1361	}
				1362
				1363
/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/
				1367
				1368	#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })
				1369
				1370	static cache_t clo_I1_cache = UNDEFINED_CACHE;
				1371	static cache_t clo_D1_cache = UNDEFINED_CACHE;
				1372	static cache_t clo_L2_cache = UNDEFINED_CACHE;
				1373
				1374
				1375	/* Checks cache config is ok; makes it so if not. */
				1376	static
				1377	void check_cache(cache_t* cache, Char *name)
				1378	{
				1379	/* First check they're all powers of two */
				1380	if (-1 == VG_(log2)(cache->size)) {
				1381	VG_(message)(Vg_UserMsg,
				1382	"error: %s size of %dB not a power of two; aborting.",
				1383	name, cache->size);
				1384	VG_(exit)(1);
				1385	}
				1386
				1387	if (-1 == VG_(log2)(cache->assoc)) {
				1388	VG_(message)(Vg_UserMsg,
				1389	"error: %s associativity of %d not a power of two; aborting.",
				1390	name, cache->assoc);
				1391	VG_(exit)(1);
				1392	}
				1393
				1394	if (-1 == VG_(log2)(cache->line_size)) {
				1395	VG_(message)(Vg_UserMsg,
				1396	"error: %s line size of %dB not a power of two; aborting.",
				1397	name, cache->line_size);
				1398	VG_(exit)(1);
				1399	}
				1400
				1401	// Then check line size >= 16 -- any smaller and a single instruction could
				1402	// straddle three cache lines, which breaks a simulation assertion and is
				1403	// stupid anyway.
				1404	if (cache->line_size < MIN_LINE_SIZE) {
				1405	VG_(message)(Vg_UserMsg,
				1406	"error: %s line size of %dB too small; aborting.",
				1407	name, cache->line_size);
				1408	VG_(exit)(1);
				1409	}
				1410
				1411	/* Then check cache size > line size (causes seg faults if not). */
				1412	if (cache->size <= cache->line_size) {
				1413	VG_(message)(Vg_UserMsg,
				1414	"error: %s cache size of %dB <= line size of %dB; aborting.",
				1415	name, cache->size, cache->line_size);
				1416	VG_(exit)(1);
				1417	}
				1418
				1419	/* Then check assoc <= (size / line size) (seg faults otherwise). */
				1420	if (cache->assoc > (cache->size / cache->line_size)) {
				1421	VG_(message)(Vg_UserMsg,
				1422	"warning: %s associativity > (size / line size); aborting.", name);
				1423	VG_(exit)(1);
				1424	}
				1425	}
				1426
				1427	static
				1428	void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
				1429	{
				1430	#define DEFINED(L) (-1 != L.size \|\| -1 != L.assoc \|\| -1 != L.line_size)
				1431
				1432	Int n_clos = 0;
				1433
				1434	// Count how many were defined on the command line.
				1435	if (DEFINED(clo_I1_cache)) { n_clos++; }
				1436	if (DEFINED(clo_D1_cache)) { n_clos++; }
				1437	if (DEFINED(clo_L2_cache)) { n_clos++; }
				1438
				1439	// Set the cache config (using auto-detection, if supported by the
				1440	// architecture)
				1441	VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );
				1442
				1443	// Then replace with any defined on the command line.
				1444	if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
				1445	if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
				1446	if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
				1447
				1448	// Then check values and fix if not acceptable.
				1449	check_cache(I1c, "I1");
				1450	check_cache(D1c, "D1");
				1451	check_cache(L2c, "L2");
				1452
				1453	if (VG_(clo_verbosity) > 1) {
				1454	VG_(message)(Vg_UserMsg, "Cache configuration used:");
				1455	VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines",
				1456	I1c->size, I1c->assoc, I1c->line_size);
				1457	VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines",
				1458	D1c->size, D1c->assoc, D1c->line_size);
				1459	VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines",
				1460	L2c->size, L2c->assoc, L2c->line_size);
				1461	}
				1462	#undef CMD_LINE_DEFINED
				1463	}
				1464
				1465
				1466	/* Initialize and clear simulator state */
				1467	static void cachesim_post_clo_init(void)
				1468	{
				1469	/* Cache configurations. */
				1470	cache_t I1c, D1c, L2c;
				1471
				1472	/* Initialize access handlers */
				1473	if (!CLG_(clo).simulate_cache) {
				1474	CLG_(cachesim).log_1I0D = 0;
				1475	CLG_(cachesim).log_1I0D_name = "(no function)";
				1476
				1477	CLG_(cachesim).log_1I1Dr = 0;
				1478	CLG_(cachesim).log_1I1Dw = 0;
				1479	CLG_(cachesim).log_1I2D = 0;
				1480	CLG_(cachesim).log_1I1Dr_name = "(no function)";
				1481	CLG_(cachesim).log_1I1Dw_name = "(no function)";
				1482	CLG_(cachesim).log_1I2D_name = "(no function)";
				1483
				1484	CLG_(cachesim).log_0I1Dr = 0;
				1485	CLG_(cachesim).log_0I1Dw = 0;
				1486	CLG_(cachesim).log_0I2D = 0;
				1487	CLG_(cachesim).log_0I1Dr_name = "(no function)";
				1488	CLG_(cachesim).log_0I1Dw_name = "(no function)";
				1489	CLG_(cachesim).log_0I2D_name = "(no function)";
				1490	return;
				1491	}
				1492
				1493	/* Configuration of caches only needed with real cache simulation */
				1494	configure_caches(&I1c, &D1c, &L2c);
				1495
				1496	I1.name = "I1";
				1497	D1.name = "D1";
				1498	L2.name = "L2";
				1499
				1500	cachesim_initcache(I1c, &I1);
				1501	cachesim_initcache(D1c, &D1);
				1502	cachesim_initcache(L2c, &L2);
				1503
				1504	/* the other cache simulators use the standard helpers
				1505	* with dispatching via simulator struct */
				1506
				1507	CLG_(cachesim).log_1I0D = log_1I0D;
				1508	CLG_(cachesim).log_1I0D_name = "log_1I0D";
				1509
				1510	CLG_(cachesim).log_1I1Dr = log_1I1Dr;
				1511	CLG_(cachesim).log_1I1Dw = log_1I1Dw;
				1512	CLG_(cachesim).log_1I2D = log_1I2D;
				1513	CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
				1514	CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
				1515	CLG_(cachesim).log_1I2D_name = "log_1I2D";
				1516
				1517	CLG_(cachesim).log_0I1Dr = log_0I1Dr;
				1518	CLG_(cachesim).log_0I1Dw = log_0I1Dw;
				1519	CLG_(cachesim).log_0I2D = log_0I2D;
				1520	CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
				1521	CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
				1522	CLG_(cachesim).log_0I2D_name = "log_0I2D";
				1523
				1524	if (clo_collect_cacheuse) {
				1525
				1526	/* Output warning for not supported option combinations */
				1527	if (clo_simulate_hwpref) {
				1528	VG_(message)(Vg_DebugMsg,
				1529	"warning: prefetch simulation can not be used with cache usage");
				1530	clo_simulate_hwpref = False;
				1531	}
				1532
				1533	if (clo_simulate_writeback) {
				1534	VG_(message)(Vg_DebugMsg,
				1535	"warning: write-back simulation can not be used with cache usage");
				1536	clo_simulate_writeback = False;
				1537	}
				1538
				1539	simulator.I1_Read = cacheuse_I1_doRead;
				1540	simulator.D1_Read = cacheuse_D1_doRead;
				1541	simulator.D1_Write = cacheuse_D1_doRead;
				1542	return;
				1543	}
				1544
				1545	if (clo_simulate_hwpref) {
				1546	prefetch_clear();
				1547
				1548	if (clo_simulate_writeback) {
				1549	simulator.I1_Read = prefetch_I1_Read;
				1550	simulator.D1_Read = prefetch_D1_Read;
				1551	simulator.D1_Write = prefetch_D1_Write;
				1552	}
				1553	else {
				1554	simulator.I1_Read = prefetch_I1_ref;
				1555	simulator.D1_Read = prefetch_D1_ref;
				1556	simulator.D1_Write = prefetch_D1_ref;
				1557	}
				1558
				1559	return;
				1560	}
				1561
				1562	if (clo_simulate_writeback) {
				1563	simulator.I1_Read = cachesim_I1_Read;
				1564	simulator.D1_Read = cachesim_D1_Read;
				1565	simulator.D1_Write = cachesim_D1_Write;
				1566	}
				1567	else {
				1568	simulator.I1_Read = cachesim_I1_ref;
				1569	simulator.D1_Read = cachesim_D1_ref;
				1570	simulator.D1_Write = cachesim_D1_ref;
				1571	}
				1572	}
				1573
				1574
				1575	/* Clear simulator state. Has to be initialized before */
				1576	static
				1577	void cachesim_clear(void)
				1578	{
				1579	cachesim_clearcache(&I1);
				1580	cachesim_clearcache(&D1);
				1581	cachesim_clearcache(&L2);
				1582
				1583	prefetch_clear();
				1584	}
				1585
				1586
				1587	static void cachesim_getdesc(Char* buf)
				1588	{
				1589	Int p;
				1590	p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
				1591	p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
				1592	VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
				1593	}
				1594
				1595	static
				1596	void cachesim_print_opts(void)
				1597	{
				1598	VG_(printf)(
				1599	"\n cache simulator options:\n"
				1600	" --simulate-cache=no\|yes Do cache simulation [no]\n"
				1601	" --simulate-wb=no\|yes Count write-back events [no]\n"
				1602	" --simulate-hwpref=no\|yes Simulate hardware prefetch [no]\n"
				1603	#if CLG_EXPERIMENTAL
				1604	" --simulate-sectors=no\|yes Simulate sectored behaviour [no]\n"
				1605	#endif
				1606	" --cacheuse=no\|yes Collect cache block use [no]\n"
				1607	" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
				1608	" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
				1609	" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
				1610	);
				1611	}
				1612
				1613	static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
				1614	{
				1615	int i1, i2, i3;
				1616	int i;
				1617	char *opt = VG_(strdup)(orig_opt);
				1618
				1619	i = i1 = opt_len;
				1620
				1621	/* Option looks like "--I1=65536,2,64".
				1622	* Find commas, replace with NULs to make three independent
				1623	* strings, then extract numbers. Yuck. */
				1624	while (VG_(isdigit)(opt[i])) i++;
				1625	if (',' == opt[i]) {
				1626	opt[i++] = '\0';
				1627	i2 = i;
				1628	} else goto bad;
				1629	while (VG_(isdigit)(opt[i])) i++;
				1630	if (',' == opt[i]) {
				1631	opt[i++] = '\0';
				1632	i3 = i;
				1633	} else goto bad;
				1634	while (VG_(isdigit)(opt[i])) i++;
				1635	if ('\0' != opt[i]) goto bad;
				1636
				1637	cache->size = (Int)VG_(atoll)(opt + i1);
				1638	cache->assoc = (Int)VG_(atoll)(opt + i2);
				1639	cache->line_size = (Int)VG_(atoll)(opt + i3);
				1640
				1641	VG_(free)(opt);
				1642
				1643	return;
				1644
				1645	bad:
sewardj	6893d65	2006-10-15 01:25:13 +0000	[diff] [blame]	1646	VG_(err_bad_option)(orig_opt);
weidendo	a17f2a3	2006-03-20 10:27:30 +0000	[diff] [blame]	1647	}
				1648
				1649	/* Check for command line option for cache configuration.
				1650	* Return False if unknown and not handled.
				1651	*
				1652	* Called from CLG_(process_cmd_line_option)() in clo.c
				1653	*/
				1654	static Bool cachesim_parse_opt(Char* arg)
				1655	{
				1656	if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
				1657	clo_simulate_writeback = True;
				1658	else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
				1659	clo_simulate_writeback = False;
				1660
				1661	else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
				1662	clo_simulate_hwpref = True;
				1663	else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
				1664	clo_simulate_hwpref = False;
				1665
				1666	else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
				1667	clo_simulate_sectors = True;
				1668	else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
				1669	clo_simulate_sectors = False;
				1670
				1671	else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
				1672	clo_collect_cacheuse = True;
				1673	/* Use counters only make sense with fine dumping */
				1674	CLG_(clo).dump_instr = True;
				1675	}
				1676	else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
				1677	clo_collect_cacheuse = False;
				1678
				1679	/* 5 is length of "--I1=" */
				1680	else if (0 == VG_(strncmp)(arg, "--I1=", 5))
				1681	parse_opt(&clo_I1_cache, arg, 5);
				1682	else if (0 == VG_(strncmp)(arg, "--D1=", 5))
				1683	parse_opt(&clo_D1_cache, arg, 5);
				1684	else if (0 == VG_(strncmp)(arg, "--L2=", 5))
				1685	parse_opt(&clo_L2_cache, arg, 5);
				1686	else
				1687	return False;
				1688
				1689	return True;
				1690	}
				1691
				1692	/* Adds commas to ULong, right justifying in a field field_width wide, returns
				1693	* the string in buf. */
				1694	static
				1695	Int commify(ULong n, int field_width, char* buf)
				1696	{
				1697	int len, n_commas, i, j, new_len, space;
				1698
				1699	VG_(sprintf)(buf, "%llu", n);
				1700	len = VG_(strlen)(buf);
				1701	n_commas = (len - 1) / 3;
				1702	new_len = len + n_commas;
				1703	space = field_width - new_len;
				1704
				1705	/* Allow for printing a number in a field_width smaller than it's size */
				1706	if (space < 0) space = 0;
				1707
				1708	/* Make j = -1 because we copy the '\0' before doing the numbers in groups
				1709	* of three. */
				1710	for (j = -1, i = len ; i >= 0; i--) {
				1711	buf[i + n_commas + space] = buf[i];
				1712
				1713	if ((i>0) && (3 == ++j)) {
				1714	j = 0;
				1715	n_commas--;
				1716	buf[i + n_commas + space] = ',';
				1717	}
				1718	}
				1719	/* Right justify in field. */
				1720	for (i = 0; i < space; i++) buf[i] = ' ';
				1721	return new_len;
				1722	}
				1723
				1724	static
				1725	void percentify(Int n, Int ex, Int field_width, char buf[])
				1726	{
				1727	int i, len, space;
				1728
				1729	VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
				1730	len = VG_(strlen)(buf);
				1731	space = field_width - len;
				1732	if (space < 0) space = 0; /* Allow for v. small field_width */
				1733	i = len;
				1734
				1735	/* Right justify in field */
				1736	for ( ; i >= 0; i--) buf[i + space] = buf[i];
				1737	for (i = 0; i < space; i++) buf[i] = ' ';
				1738	}
				1739
				1740	static
				1741	void cachesim_printstat(void)
				1742	{
				1743	FullCost total = CLG_(total_cost), D_total = 0;
				1744	ULong L2_total_m, L2_total_mr, L2_total_mw,
				1745	L2_total, L2_total_r, L2_total_w;
				1746	char buf1[RESULTS_BUF_LEN],
				1747	buf2[RESULTS_BUF_LEN],
				1748	buf3[RESULTS_BUF_LEN];
				1749	Int l1, l2, l3;
				1750	Int p;
				1751
				1752	if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
				1753	VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
				1754	prefetch_up);
				1755	VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
				1756	prefetch_down);
				1757	VG_(message)(Vg_DebugMsg, "");
				1758	}
				1759
				1760	/* I cache results. Use the I_refs value to determine the first column
				1761	* width. */
				1762	l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
				1763	VG_(message)(Vg_UserMsg, "I refs: %s", buf1);
				1764
				1765	if (!CLG_(clo).simulate_cache) return;
				1766
				1767	commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
				1768	VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);
				1769
				1770	commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
				1771	VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);
				1772
				1773	p = 100;
				1774
				1775	if (0 == total[CLG_(sets).off_full_Ir])
				1776	total[CLG_(sets).off_full_Ir] = 1;
				1777
				1778	percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
				1779	total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
				1780	VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);
				1781
				1782	percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
				1783	total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
				1784	VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
				1785	VG_(message)(Vg_UserMsg, "");
				1786
				1787	/* D cache results.
				1788	Use the D_refs.rd and D_refs.wr values to determine the
				1789	* width of columns 2 & 3. */
				1790
				1791	D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
				1792	CLG_(init_cost)( CLG_(sets).full, D_total);
				1793	CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
				1794	CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );
				1795
				1796	commify( D_total[0], l1, buf1);
				1797	l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
				1798	l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
				1799	VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
				1800	buf1, buf2, buf3);
				1801
				1802	commify( D_total[1], l1, buf1);
				1803	commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
				1804	commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
				1805	VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
				1806	buf1, buf2, buf3);
				1807
				1808	commify( D_total[2], l1, buf1);
				1809	commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
				1810	commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
				1811	VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
				1812	buf1, buf2, buf3);
				1813
				1814	p = 10;
				1815
				1816	if (0 == D_total[0]) D_total[0] = 1;
				1817	if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
				1818	if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;
				1819
				1820	percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
				1821	percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
				1822	total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
				1823	percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
				1824	total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
				1825	VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);
				1826
				1827	percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
				1828	percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
				1829	total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
				1830	percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
				1831	total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
				1832	VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
				1833	VG_(message)(Vg_UserMsg, "");
				1834
				1835
				1836
				1837	/* L2 overall results */
				1838
				1839	L2_total =
				1840	total[CLG_(sets).off_full_Dr +1] +
				1841	total[CLG_(sets).off_full_Dw +1] +
				1842	total[CLG_(sets).off_full_Ir +1];
				1843	L2_total_r =
				1844	total[CLG_(sets).off_full_Dr +1] +
				1845	total[CLG_(sets).off_full_Ir +1];
				1846	L2_total_w = total[CLG_(sets).off_full_Dw +1];
				1847	commify(L2_total, l1, buf1);
				1848	commify(L2_total_r, l2, buf2);
				1849	commify(L2_total_w, l3, buf3);
				1850	VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
				1851	buf1, buf2, buf3);
				1852
				1853	L2_total_m =
				1854	total[CLG_(sets).off_full_Dr +2] +
				1855	total[CLG_(sets).off_full_Dw +2] +
				1856	total[CLG_(sets).off_full_Ir +2];
				1857	L2_total_mr =
				1858	total[CLG_(sets).off_full_Dr +2] +
				1859	total[CLG_(sets).off_full_Ir +2];
				1860	L2_total_mw = total[CLG_(sets).off_full_Dw +2];
				1861	commify(L2_total_m, l1, buf1);
				1862	commify(L2_total_mr, l2, buf2);
				1863	commify(L2_total_mw, l3, buf3);
				1864	VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
				1865	buf1, buf2, buf3);
				1866
				1867	percentify(L2_total_m * 100 * p /
				1868	(total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
				1869	percentify(L2_total_mr * 100 * p /
				1870	(total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
				1871	p, l2+1, buf2);
				1872	percentify(L2_total_mw * 100 * p /
				1873	total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
				1874	VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
				1875	buf1, buf2,buf3);
				1876	}
				1877
				1878
/*------------------------------------------------------------*/
/*--- Setup for Event set. ---*/
/*------------------------------------------------------------*/
				1882
				1883	struct event_sets CLG_(sets);
				1884
				1885	void CLG_(init_eventsets)(Int max_user)
				1886	{
				1887	EventType * e1, e2, e3, *e4;
				1888	EventSet Ir, Dr, *Dw;
				1889	EventSet D0, D1r, D1w, D2;
				1890	EventSet sim, full;
				1891	EventSet *use;
				1892	int sizeOfUseIr;
				1893
				1894	use = CLG_(get_eventset)("Use", 4);
				1895	if (clo_collect_cacheuse) {
				1896	/* if TUse is 0, there was never a load, and no loss, too */
				1897	e1 = CLG_(register_eventtype)("AcCost1");
				1898	CLG_(add_eventtype)(use, e1);
				1899	e1 = CLG_(register_eventtype)("SpLoss1");
				1900	CLG_(add_eventtype)(use, e1);
				1901	e1 = CLG_(register_eventtype)("AcCost2");
				1902	CLG_(add_eventtype)(use, e1);
				1903	e1 = CLG_(register_eventtype)("SpLoss2");
				1904	CLG_(add_eventtype)(use, e1);
				1905	}
				1906
				1907	Ir = CLG_(get_eventset)("Ir", 4);
				1908	Dr = CLG_(get_eventset)("Dr", 4);
				1909	Dw = CLG_(get_eventset)("Dw", 4);
				1910	if (CLG_(clo).simulate_cache) {
				1911	e1 = CLG_(register_eventtype)("Ir");
				1912	e2 = CLG_(register_eventtype)("I1mr");
				1913	e3 = CLG_(register_eventtype)("I2mr");
				1914	if (clo_simulate_writeback) {
				1915	e4 = CLG_(register_eventtype)("I2dmr");
				1916	CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
				1917	}
				1918	else
				1919	CLG_(add_dep_event3)(Ir, e1,e2,e3);
				1920
				1921	e1 = CLG_(register_eventtype)("Dr");
				1922	e2 = CLG_(register_eventtype)("D1mr");
				1923	e3 = CLG_(register_eventtype)("D2mr");
				1924	if (clo_simulate_writeback) {
				1925	e4 = CLG_(register_eventtype)("D2dmr");
				1926	CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
				1927	}
				1928	else
				1929	CLG_(add_dep_event3)(Dr, e1,e2,e3);
				1930
				1931	e1 = CLG_(register_eventtype)("Dw");
				1932	e2 = CLG_(register_eventtype)("D1mw");
				1933	e3 = CLG_(register_eventtype)("D2mw");
				1934	if (clo_simulate_writeback) {
				1935	e4 = CLG_(register_eventtype)("D2dmw");
				1936	CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
				1937	}
				1938	else
				1939	CLG_(add_dep_event3)(Dw, e1,e2,e3);
				1940
				1941	}
				1942	else {
				1943	e1 = CLG_(register_eventtype)("Ir");
				1944	CLG_(add_eventtype)(Ir, e1);
				1945	}
				1946
				1947	sizeOfUseIr = use->size + Ir->size;
				1948	D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
				1949	CLG_(add_eventset)(D0, use);
				1950	off_D0_Ir = CLG_(add_eventset)(D0, Ir);
				1951
				1952	D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
				1953	CLG_(add_eventset)(D1r, use);
				1954	off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
				1955	off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
				1956
				1957	D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
				1958	CLG_(add_eventset)(D1w, use);
				1959	off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
				1960	off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);
				1961
				1962	D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
				1963	CLG_(add_eventset)(D2, use);
				1964	off_D2_Ir = CLG_(add_eventset)(D2, Ir);
				1965	off_D2_Dr = CLG_(add_eventset)(D2, Dr);
				1966	off_D2_Dw = CLG_(add_eventset)(D2, Dw);
				1967
				1968	sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
				1969	CLG_(add_eventset)(sim, use);
				1970	CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
				1971	CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
				1972	CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);
				1973
				1974	if (CLG_(clo).collect_alloc) max_user += 2;
				1975	if (CLG_(clo).collect_systime) max_user += 2;
				1976
				1977	full = CLG_(get_eventset)("full", sim->size + max_user);
				1978	CLG_(add_eventset)(full, sim);
				1979	CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
				1980	CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
				1981	CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;
				1982
				1983	CLG_(sets).use = use;
				1984	CLG_(sets).Ir = Ir;
				1985	CLG_(sets).Dr = Dr;
				1986	CLG_(sets).Dw = Dw;
				1987
				1988	CLG_(sets).D0 = D0;
				1989	CLG_(sets).D1r = D1r;
				1990	CLG_(sets).D1w = D1w;
				1991	CLG_(sets).D2 = D2;
				1992
				1993	CLG_(sets).sim = sim;
				1994	CLG_(sets).full = full;
				1995
				1996	if (CLG_(clo).collect_alloc) {
				1997	e1 = CLG_(register_eventtype)("allocCount");
				1998	e2 = CLG_(register_eventtype)("allocSize");
				1999	CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
				2000	}
				2001
				2002	if (CLG_(clo).collect_systime) {
				2003	e1 = CLG_(register_eventtype)("sysCount");
				2004	e2 = CLG_(register_eventtype)("sysTime");
				2005	CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
				2006	}
				2007
				2008	CLG_DEBUGIF(1) {
				2009	CLG_DEBUG(1, "EventSets:\n");
				2010	CLG_(print_eventset)(-2, use);
				2011	CLG_(print_eventset)(-2, Ir);
				2012	CLG_(print_eventset)(-2, Dr);
				2013	CLG_(print_eventset)(-2, Dw);
				2014	CLG_(print_eventset)(-2, sim);
				2015	CLG_(print_eventset)(-2, full);
				2016	}
				2017
				2018	/* Not-existing events are silently ignored */
				2019	CLG_(dumpmap) = CLG_(get_eventmapping)(full);
				2020	CLG_(append_event)(CLG_(dumpmap), "Ir");
				2021	CLG_(append_event)(CLG_(dumpmap), "Dr");
				2022	CLG_(append_event)(CLG_(dumpmap), "Dw");
				2023	CLG_(append_event)(CLG_(dumpmap), "I1mr");
				2024	CLG_(append_event)(CLG_(dumpmap), "D1mr");
				2025	CLG_(append_event)(CLG_(dumpmap), "D1mw");
				2026	CLG_(append_event)(CLG_(dumpmap), "I2mr");
				2027	CLG_(append_event)(CLG_(dumpmap), "D2mr");
				2028	CLG_(append_event)(CLG_(dumpmap), "D2mw");
				2029	CLG_(append_event)(CLG_(dumpmap), "I2dmr");
				2030	CLG_(append_event)(CLG_(dumpmap), "D2dmr");
				2031	CLG_(append_event)(CLG_(dumpmap), "D2dmw");
				2032	CLG_(append_event)(CLG_(dumpmap), "AcCost1");
				2033	CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
				2034	CLG_(append_event)(CLG_(dumpmap), "AcCost2");
				2035	CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
				2036	CLG_(append_event)(CLG_(dumpmap), "allocCount");
				2037	CLG_(append_event)(CLG_(dumpmap), "allocSize");
				2038	CLG_(append_event)(CLG_(dumpmap), "sysCount");
				2039	CLG_(append_event)(CLG_(dumpmap), "sysTime");
				2040
				2041	}
				2042
				2043
				2044
				2045	static
				2046	void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
				2047	{
				2048	/* if eventset use is defined, it is always first (hardcoded!) */
				2049	CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);
				2050
				2051	/* FIXME: This is hardcoded... */
				2052	if (es == CLG_(sets).D0) {
				2053	CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
				2054	cost + off_D0_Ir);
				2055	}
				2056	else if (es == CLG_(sets).D1r) {
				2057	CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
				2058	cost + off_D1r_Ir);
				2059	CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
				2060	cost + off_D1r_Dr);
				2061	}
				2062	else if (es == CLG_(sets).D1w) {
				2063	CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
				2064	cost + off_D1w_Ir);
				2065	CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
				2066	cost + off_D1w_Dw);
				2067	}
				2068	else {
				2069	CLG_ASSERT(es == CLG_(sets).D2);
				2070	CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
				2071	cost + off_D2_Ir);
				2072	CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
				2073	cost + off_D2_Dr);
				2074	CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
				2075	cost + off_D2_Dw);
				2076	}
				2077	}
				2078
				2079	/* this is called at dump time for every instruction executed */
				2080	static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
				2081	InstrInfo* ii, ULong exe_count)
				2082	{
				2083	if (!CLG_(clo).simulate_cache)
				2084	cost[CLG_(sets).off_sim_Ir] += exe_count;
				2085	else {
				2086
				2087	#if 0
				2088	/* There is always a trivial case where exe_count and Ir can be
				2089	* slightly different because ecounter is updated when executing
				2090	* the next BB. E.g. for last BB executed, or when toggling collection
				2091	*/
				2092	/* FIXME: Hardcoded that each eventset has Ir as first */
				2093	if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
				2094	VG_(printf)("==> Ir %llu, exe %llu\n",
				2095	(bbcc->cost + ii->cost_offset)[0], exe_count);
				2096	CLG_(print_bbcc_cost)(-2, bbcc);
				2097	//CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
				2098	}
				2099	#endif
				2100
				2101	add_and_zero_Dx(ii->eventset, cost,
				2102	bbcc->cost + ii->cost_offset);
				2103	}
				2104	}
				2105
				2106	static
				2107	void cachesim_after_bbsetup(void)
				2108	{
				2109	BBCC* bbcc = CLG_(current_state).bbcc;
				2110
				2111	if (CLG_(clo).simulate_cache) {
				2112	BB* bb = bbcc->bb;
				2113
				2114	/* only needed if log_* functions are called */
				2115	bb_base = bb->obj->offset + bb->offset;
				2116	cost_base = bbcc->cost;
				2117	}
				2118	}
				2119
				2120	static
				2121	void cachesim_finish(void)
				2122	{
				2123	if (clo_collect_cacheuse)
				2124	cacheuse_finish();
				2125	}
				2126
/*------------------------------------------------------------*/
/*--- The simulator defined in this file ---*/
/*------------------------------------------------------------*/
				2130
				2131	struct cachesim_if CLG_(cachesim) = {
				2132	.print_opts = cachesim_print_opts,
				2133	.parse_opt = cachesim_parse_opt,
				2134	.post_clo_init = cachesim_post_clo_init,
				2135	.clear = cachesim_clear,
				2136	.getdesc = cachesim_getdesc,
				2137	.printstat = cachesim_printstat,
				2138	.add_icost = cachesim_add_icost,
				2139	.after_bbsetup = cachesim_after_bbsetup,
				2140	.finish = cachesim_finish,
				2141
				2142	/* these will be set by cachesim_post_clo_init */
				2143	.log_1I0D = 0,
				2144
				2145	.log_1I1Dr = 0,
				2146	.log_1I1Dw = 0,
				2147	.log_1I2D = 0,
				2148
				2149	.log_0I1Dr = 0,
				2150	.log_0I1Dw = 0,
				2151	.log_0I2D = 0,
				2152
				2153	.log_1I0D_name = "(no function)",
				2154
				2155	.log_1I1Dr_name = "(no function)",
				2156	.log_1I1Dw_name = "(no function)",
				2157	.log_1I2D_name = "(no function)",
				2158
				2159	.log_0I1Dr_name = "(no function)",
				2160	.log_0I1Dw_name = "(no function)",
				2161	.log_0I2D_name = "(no function)"
				2162	};
				2163
				2164
/*--------------------------------------------------------------------*/
/*--- end sim.c ---*/
/*--------------------------------------------------------------------*/
				2168