/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind.
   (c) 2003-2005, Josef Weidendorfer

   Parts are Copyright (C) 2002 Nicholas Nethercote
      njn25@cam.ac.uk


   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
   - simulates a write-allocate cache
   - (block --> set) hash function uses simple bit selection
   - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
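
/* Illustrative sketch (not part of the original source): with 64 byte
 * lines, a 4 byte access at line offset 62 touches two lines. Assuming
 * the first line hits and the second misses, the whole access counts
 * as one miss, exactly as cachesim_ref() below combines the results:
 *
 *    res1 = Hit; res2 = Miss;
 *    result = ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;   // Miss
 */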

/* Cache configuration */
#include "cg_arch.h"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
   UInt count;
   UInt mask; /* e.g. for a 64 byte line size, 1 bit per 2 bytes */
} line_use;

typedef struct {
   Addr memline, iaddr;
   line_use* dep_use; /* point to higher-level cacheblock for this memline */
   ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char* name;
   int size;        /* bytes */
   int assoc;
   int line_size;   /* bytes */
   Bool sectored;   /* prefetch nearside cacheline on read */
   int sets;
   int sets_min_1;
   int assoc_bits;
   int line_size_bits;
   int tag_shift;
   UWord tag_mask;
   char desc_line[128];
   UWord* tags;

   /* for cache use */
   int line_size_mask;
   int* line_start_mask;
   int* line_end_mask;
   line_loaded* loaded;
   line_use* use;
} cache_t2;

/*
 * States of flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1
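
/* Example (illustrative only): with MIN_LINE_SIZE = 16, the low 4 bits
 * of a stored tag entry are zero in tag_mask and thus free for flags.
 * The write-back model below marks a line dirty by OR'ing the flag
 * into the stored tag, and tests it again on eviction:
 *
 *    set[0] = tag | CACHELINE_DIRTY;                 // mark dirty
 *    Bool dirty = (set[0] & CACHELINE_DIRTY) != 0;   // test on eviction
 */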


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */

/* Offset to events in event set, used in log_* functions */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

static Addr   bb_base;
static ULong* cost_base;
static InstrInfo* current_ii;

/* Cache use offsets */
/* FIXME: The offsets are only correct because all event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_L2_AcCost = 2;
static Int off_L2_SpLoss = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
   L1_Hit,
   L2_Hit,
   MemAccess,
   WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
   simcall_type I1_Read;
   simcall_type D1_Read;
   simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
   Int i;

   for (i = 0; i < c->sets * c->assoc; i++)
      c->tags[i] = 0;
   if (c->use) {
      for (i = 0; i < c->sets * c->assoc; i++) {
         c->loaded[i].memline  = 0;
         c->loaded[i].use_base = 0;
         c->loaded[i].dep_use  = 0;
         c->loaded[i].iaddr    = 0;
         c->use[i].mask  = 0;
         c->use[i].count = 0;
         c->tags[i] = i % c->assoc; /* init lower bits as pointer */
      }
   }
}
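
/* Worked example of the "init lower bits as pointer" trick above
 * (illustrative, not from the original source): for a 4-way cache,
 * each set's tag entries start out as 0,1,2,3. The cache-use code
 * keeps these low bits intact across evictions, so that
 * (set_no << assoc_bits) | (tag & ~tag_mask) always indexes the
 * per-line use/loaded arrays, regardless of how lines get shuffled
 * within the set.
 */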

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->assoc_bits     = VG_(log2)(c->assoc);
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * This should always be true, as MIN_LINE_SIZE >= 16. */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC(sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & L2 Write Through
 * Does not distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == set[0])
      return Hit;

   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == set[i]) {
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag;

   return Miss;
}

static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
   UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag  = a >> c->tag_shift;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref(c, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      UWord tag2 = (a+size-1) >> c->tag_shift;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref(c, set1, tag);
      CacheResult res2 = cachesim_setref(c, set2, tag2);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, L2 Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, a miss can evict a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 * type (Read/Write) into the tag, the line gets dirty on a write.
 */
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set, tmp_tag;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & ~CACHELINE_DIRTY)) {
      set[0] |= ref;
      return Hit;
   }
   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & ~CACHELINE_DIRTY)) {
         tmp_tag = set[i] | ref; // update dirty flag
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1];
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | ref;

   return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
   UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag  = a & c->tag_mask;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref_wb(c, ref, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      /* the second tag must be masked like 'tag' above, so that the
       * low bits stay free for the dirty flag */
      UWord tag2 = (a+size-1) & c->tag_mask;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
      CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

      if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for an L1 hit, the write-through L1 passes
       * the write on to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
   int i;
   for(i=0;i<PF_STREAMS;i++)
      pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW Prefetch emulation
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
static __inline__
void prefetch_L2_doref(Addr a, UChar size)
{
   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
   UInt block  = ( a >> L2.line_size_bits);

   if (block != pf_lastblock[stream]) {
      if (pf_seqblocks[stream] == 0) {
         if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
         else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
      }
      else if (pf_seqblocks[stream] >0) {
         if (pf_lastblock[stream] +1 == block) {
            pf_seqblocks[stream]++;
            if (pf_seqblocks[stream] >= 2) {
               prefetch_up++;
               cachesim_ref(&L2, a + 5 * L2.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      else if (pf_seqblocks[stream] <0) {
         if (pf_lastblock[stream] -1 == block) {
            pf_seqblocks[stream]--;
            if (pf_seqblocks[stream] <= -2) {
               prefetch_down++;
               cachesim_ref(&L2, a - 5 * L2.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      pf_lastblock[stream] = block;
   }
}
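
/* Worked example (illustrative): upward accesses hitting sequential L2
 * blocks b, b+1, b+2 within one 4 KiB page drive pf_seqblocks for that
 * stream from 0 to 2, so the third access already triggers
 * cachesim_ref(&L2, a + 5 * L2.line_size, 1), i.e. a prefetch five
 * lines ahead; any non-sequential block resets the stream counter to 0.
 */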

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a,size);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
   prefetch_L2_doref(a,size);
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for an L1 hit, the write-through L1 passes
       * the write on to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
   int i;
   unsigned int start_mask, start_val;
   unsigned int end_mask, end_val;

   c->use    = CLG_MALLOC(sizeof(line_use) * c->sets * c->assoc);
   c->loaded = CLG_MALLOC(sizeof(line_loaded) * c->sets * c->assoc);
   c->line_start_mask = CLG_MALLOC(sizeof(int) * c->line_size);
   c->line_end_mask   = CLG_MALLOC(sizeof(int) * c->line_size);


   c->line_size_mask = c->line_size-1;

   /* Meaning of line_start_mask/line_end_mask
    * Example: for a given cache line, you get an access starting at
    * byte offset 5 with length 4, i.e. bytes 5 - 8 were touched. For a
    * cache line size of 32, you have 1 bit per byte in the mask:
    *
    *    bit31   bit8 bit5  bit 0
    *        |      |  |    |
    *        11..111111100000   line_start_mask[5]
    *        00..000111111111   line_end_mask[(5+4)-1]
    *
    *    use_mask |= line_start_mask[5] && line_end_mask[8]
    *
    */
   start_val = end_val = ~0;
   if (c->line_size < 32) {
      int bits_per_byte = 32/c->line_size;
      start_mask = (1<<bits_per_byte)-1;
      end_mask   = start_mask << (32-bits_per_byte);
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         start_val  = start_val & ~start_mask;
         start_mask = start_mask << bits_per_byte;

         c->line_end_mask[c->line_size-i-1] = end_val;
         end_val  = end_val & ~end_mask;
         end_mask = end_mask >> bits_per_byte;
      }
   }
   else {
      int bytes_per_bit = c->line_size/32;
      start_mask = 1;
      end_mask   = 1 << 31;
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         c->line_end_mask[c->line_size-i-1] = end_val;
         if ( ((i+1)%bytes_per_bit) == 0) {
            start_val   &= ~start_mask;
            end_val     &= ~end_mask;
            start_mask <<= 1;
            end_mask   >>= 1;
         }
      }
   }

   CLG_DEBUG(6, "Config %s:\n", c->desc_line);
   for(i=0;i<c->line_size;i++) {
      CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                i, c->line_start_mask[i], c->line_end_mask[i]);
   }

   /* We use lower tag bits as offset pointers to cache use info.
    * This means that some cache parameter combinations don't work.
    */
   if (c->tag_shift < c->assoc_bits) {
      VG_(message)(Vg_DebugMsg,
                   "error: Use associativity < %d for cache use statistics!",
                   (1<<c->tag_shift) );
      VG_(tool_panic)("Unsupported cache configuration");
   }
}

/* FIXME: A little tricky. The draft below is disabled: it still relies
 * on helpers (cacheuse_update, cacheuse_L2_miss) that are not defined
 * anywhere in this file. The working implementation is the CACHEUSE
 * macro further down. */
#if 0

static __inline__
void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
{
   int idx = (high_idx << c->assoc_bits) | low_idx;

   c->use[idx].count ++;
   c->use[idx].mask |= use_mask;

   CLG_DEBUG(6," Hit [idx %d] (line %p from %p): %x => %08x, count %d\n",
             idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
             use_mask, c->use[idx].mask, c->use[idx].count);
}

/* only used for I1, D1 */

static __inline__
CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag,
                            Addr a, UChar size)
{
   int i, j;
   UWord *set, tmp_tag;
   UInt use_mask;

   /* Shifting is a bit faster than multiplying */
   set = &(c->tags[set_no << c->assoc_bits]);
   use_mask =
      c->line_start_mask[a & c->line_size_mask] &
      c->line_end_mask[(a+size-1) & c->line_size_mask];

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & c->tag_mask)) {
      cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask);
      return Hit;
   }

   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & c->tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;

         cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1] & ~c->tag_mask;
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;

   cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
                    use_mask, a & ~c->line_size_mask);

   return Miss;
}


static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
{
   UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag  = a >> c->tag_shift;

   /* Access entirely within line. */
   if (set1 == set2)
      return cacheuse_setref(c, set1, tag, a, size);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      UWord tag2 = (a+size-1) >> c->tag_shift;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cacheuse_setref(c, set1, tag,  a, size);
      CacheResult res2 = cacheuse_setref(c, set2, tag2, a, size);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
#endif


/* for I1/D1 caches */
#define CACHEUSE(L)                                                          \
                                                                             \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)          \
{                                                                            \
   UInt  set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt  set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                              \
   UWord tag2;                                                               \
   int i, j, idx;                                                            \
   UWord *set, tmp_tag;                                                      \
   UInt use_mask;                                                            \
                                                                             \
   CLG_DEBUG(6,"%s.Acc(Addr %p, size %d): Sets [%d/%d]\n",                   \
             L.name, a, size, set1, set2);                                   \
                                                                             \
   /* First case: word entirely within line. */                              \
   if (set1 == set2) {                                                       \
                                                                             \
      /* Shifting is a bit faster than multiplying */                        \
      set = &(L.tags[set1 << L.assoc_bits]);                                 \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                   \
                 L.line_end_mask[(a+size-1) & L.line_size_mask];             \
                                                                             \
      /* This loop is unrolled for just the first case, which is the most */ \
      /* common.  We can't unroll any further because it would screw up   */ \
      /* if we have a direct-mapped (1-way) cache.                        */ \
      if (tag == (set[0] & L.tag_mask)) {                                    \
         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask);              \
         L.use[idx].count ++;                                                \
         L.use[idx].mask |= use_mask;                                        \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,          \
                   use_mask, L.use[idx].mask, L.use[idx].count);             \
         return L1_Hit;                                                      \
      }                                                                      \
      /* If the tag is one other than the MRU, move it into the MRU spot */  \
      /* and shuffle the rest down. */                                       \
      for (i = 1; i < L.assoc; i++) {                                        \
         if (tag == (set[i] & L.tag_mask)) {                                 \
            tmp_tag = set[i];                                                \
            for (j = i; j > 0; j--) {                                        \
               set[j] = set[j - 1];                                          \
            }                                                                \
            set[0] = tmp_tag;                                                \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);          \
            L.use[idx].count ++;                                             \
            L.use[idx].mask |= use_mask;                                     \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,    \
                      use_mask, L.use[idx].mask, L.use[idx].count);          \
            return L1_Hit;                                                   \
         }                                                                   \
      }                                                                      \
                                                                             \
      /* A miss; install this tag as MRU, shuffle rest down. */              \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                              \
      for (j = L.assoc - 1; j > 0; j--) {                                    \
         set[j] = set[j - 1];                                                \
      }                                                                      \
      set[0] = tag | tmp_tag;                                                \
      idx = (set1 << L.assoc_bits) | tmp_tag;                                \
      return update_##L##_use(&L, idx,                                       \
                              use_mask, a &~ L.line_size_mask);              \
                                                                             \
   /* Second case: word straddles two lines. */                              \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                 \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                           \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: L2 miss */          \
      set = &(L.tags[set1 << L.assoc_bits]);                                 \
      use_mask = L.line_start_mask[a & L.line_size_mask];                    \
      if (tag == (set[0] & L.tag_mask)) {                                    \
         idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask);              \
         L.use[idx].count ++;                                                \
         L.use[idx].mask |= use_mask;                                        \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,          \
                   use_mask, L.use[idx].mask, L.use[idx].count);             \
         goto block2;                                                        \
      }                                                                      \
      for (i = 1; i < L.assoc; i++) {                                        \
         if (tag == (set[i] & L.tag_mask)) {                                 \
            tmp_tag = set[i];                                                \
            for (j = i; j > 0; j--) {                                        \
               set[j] = set[j - 1];                                          \
            }                                                                \
            set[0] = tmp_tag;                                                \
            idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);          \
            L.use[idx].count ++;                                             \
            L.use[idx].mask |= use_mask;                                     \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,    \
                      use_mask, L.use[idx].mask, L.use[idx].count);          \
            goto block2;                                                     \
         }                                                                   \
      }                                                                      \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                              \
      for (j = L.assoc - 1; j > 0; j--) {                                    \
         set[j] = set[j - 1];                                                \
      }                                                                      \
      set[0] = tag | tmp_tag;                                                \
      idx = (set1 << L.assoc_bits) | tmp_tag;                                \
      miss1 = update_##L##_use(&L, idx,                                      \
                               use_mask, a &~ L.line_size_mask);             \
block2:                                                                      \
      set = &(L.tags[set2 << L.assoc_bits]);                                 \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];             \
      tag2 = (a+size-1) & L.tag_mask;                                        \
      if (tag2 == (set[0] & L.tag_mask)) {                                   \
         idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask);              \
         L.use[idx].count ++;                                                \
         L.use[idx].mask |= use_mask;                                        \
         CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,          \
                   use_mask, L.use[idx].mask, L.use[idx].count);             \
         return miss1;                                                       \
      }                                                                      \
      for (i = 1; i < L.assoc; i++) {                                        \
         if (tag2 == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                                \
            for (j = i; j > 0; j--) {                                        \
               set[j] = set[j - 1];                                          \
            }                                                                \
            set[0] = tmp_tag;                                                \
            idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask);          \
            L.use[idx].count ++;                                             \
            L.use[idx].mask |= use_mask;                                     \
            CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,    \
                      use_mask, L.use[idx].mask, L.use[idx].count);          \
            return miss1;                                                    \
         }                                                                   \
      }                                                                      \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                              \
      for (j = L.assoc - 1; j > 0; j--) {                                    \
         set[j] = set[j - 1];                                                \
      }                                                                      \
      set[0] = tag2 | tmp_tag;                                               \
      idx = (set2 << L.assoc_bits) | tmp_tag;                                \
      miss2 = update_##L##_use(&L, idx,                                      \
                               use_mask, (a+size-1) &~ L.line_size_mask);    \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess : L2_Hit;    \
                                                                             \
   } else {                                                                  \
      VG_(printf)("addr: %p  size: %u  sets: %d %d", a, size, set1, set2);   \
      VG_(tool_panic)("item straddles more than two cache sets");            \
   }                                                                         \
   return 0;                                                                 \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
   unsigned int c; // store the total here
   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

   c = bits;
   c = ((c >> S[0]) & B[0]) + (c & B[0]);
   c = ((c >> S[1]) & B[1]) + (c & B[1]);
   c = ((c >> S[2]) & B[2]) + (c & B[2]);
   c = ((c >> S[3]) & B[3]) + (c & B[3]);
   c = ((c >> S[4]) & B[4]) + (c & B[4]);
   return c;
}
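
#if 0
/* Minimal self-check for countBits (illustrative sketch, not part of
 * the original build). Each step above sums adjacent bit groups of
 * doubling width, so the final value is the population count: */
static void countBits_selftest(void)
{
   CLG_ASSERT(countBits(0x00000000) ==  0);
   CLG_ASSERT(countBits(0x000000FF) ==  8);
   CLG_ASSERT(countBits(0xA5A5A5A5) == 16);  /* 4 bits set per byte */
   CLG_ASSERT(countBits(0xFFFFFFFF) == 32);
}
#endif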

static void update_L2_use(int idx, Addr memline)
{
   line_loaded* loaded = &(L2.loaded[idx]);
   line_use* use = &(L2.use[idx]);
   int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;

   CLG_DEBUG(2, " L2.miss [%d]: at %p accessing memline %p\n",
             idx, bb_base + current_ii->instr_offset, memline);
   if (use->count>0) {
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %p from %p]\n",
                use->count, i, use->mask, loaded->memline, loaded->iaddr);
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",
                CLG_(current_state).collect, loaded->use_base);

      if (CLG_(current_state).collect && loaded->use_base) {
         (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
         (loaded->use_base)[off_L2_SpLoss] += i;
      }
   }

   use->count = 0;
   use->mask  = 0;

   loaded->memline = memline;
   loaded->iaddr   = bb_base + current_ii->instr_offset;
   loaded->use_base = (CLG_(current_state).nonskipped) ?
                      CLG_(current_state).nonskipped->skipped :
                      cost_base + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
   UWord tag  = memline & L2.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"L2.Acc(Memline %p): Set %d\n", memline, setNo);

   if (tag == (set[0] & L2.tag_mask)) {
      idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): => %08x, count %d\n",
                idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                L2.use[idx].mask, L2.use[idx].count);
      return L2_Hit;
   }
   for (i = 1; i < L2.assoc; i++) {
      if (tag == (set[i] & L2.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
         l1_loaded->dep_use = &(L2.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): => %08x, count %d\n",
                   i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                   L2.use[idx].mask, L2.use[idx].count);
         return L2_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
   for (j = L2.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo << L2.assoc_bits) | tmp_tag;
   l1_loaded->dep_use = &(L2.use[idx]);

   update_L2_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L)                                                       \
                                                                            \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx,        \
                                           UInt mask, Addr memline)         \
{                                                                           \
   line_loaded* loaded = &(cache->loaded[idx]);                             \
   line_use* use = &(cache->use[idx]);                                      \
   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;             \
                                                                            \
   CLG_DEBUG(2, " %s.miss [%d]: at %p accessing memline %p (mask %08x)\n",  \
             cache->name, idx, bb_base + current_ii->instr_offset,          \
             memline, mask);                                                \
   if (use->count>0) {                                                      \
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %p from %p]\n", \
                use->count, c, use->mask, loaded->memline, loaded->iaddr);  \
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",                         \
                CLG_(current_state).collect, loaded->use_base);             \
                                                                            \
      if (CLG_(current_state).collect && loaded->use_base) {                \
         (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;         \
         (loaded->use_base)[off_##L##_SpLoss] += c;                         \
                                                                            \
         /* FIXME (?): L1/L2 line sizes must be equal ! */                  \
         loaded->dep_use->mask  |= use->mask;                               \
         loaded->dep_use->count += use->count;                              \
      }                                                                     \
   }                                                                        \
                                                                            \
   use->count = 1;                                                          \
   use->mask  = mask;                                                       \
   loaded->memline = memline;                                               \
   loaded->iaddr   = bb_base + current_ii->instr_offset;                    \
   loaded->use_base = (CLG_(current_state).nonskipped) ?                    \
                      CLG_(current_state).nonskipped->skipped :             \
                      cost_base + current_ii->cost_offset;                  \
                                                                            \
   if (memline == 0) return L2_Hit;                                         \
   return cacheuse_L2_access(memline, loaded);                              \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
   int i;
   InstrInfo ii = { 0,0,0,0,0 };

   if (!CLG_(current_state).collect) return;

   bb_base    = 0;
   current_ii = &ii;
   cost_base  = 0;

   /* update usage counters */
   if (I1.use)
      for (i = 0; i < I1.sets * I1.assoc; i++)
         if (I1.loaded[i].use_base)
            update_I1_use( &I1, i, 0,0);

   if (D1.use)
      for (i = 0; i < D1.sets * D1.assoc; i++)
         if (D1.loaded[i].use_base)
            update_D1_use( &D1, i, 0,0);

   if (L2.use)
      for (i = 0; i < L2.sets * L2.assoc; i++)
         if (L2.loaded[i].use_base)
            update_L2_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
   switch(r) {
      case WriteBackMemAccess:
         if (clo_simulate_writeback) {
            c1[3]++;
            c2[3]++;
         }
         // fall through

      case MemAccess:
         c1[2]++;
         c2[2]++;
         // fall through

      case L2_Hit:
         c1[1]++;
         c2[1]++;
         // fall through

      default:
         c1[0]++;
         c2[0]++;
   }
}
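
/* Example of the fall-through counting above (illustrative): for an
 * access classified as MemAccess, inc_costs bumps c1[2]/c2[2] (memory
 * accesses, i.e. L2 misses), then falls through to c1[1]/c2[1] (L2
 * accesses, i.e. L1 misses) and c1[0]/c2[0] (total accesses), so each
 * counter records the accesses that reached at least that level.
 */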


VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
   CacheModelResult IrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);

   CLG_DEBUG(6, "log_1I0D: Ir=%p/%u => Ir %d\n",
             bb_base + ii->instr_offset, ii->instr_size, IrRes);

   if (CLG_(current_state).collect) {
      ULong* cost_Ir;

      if (CLG_(current_state).nonskipped)
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
      else
         cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
   }
}


/* Instruction doing a read access */

VG_REGPARM(2)
static void log_1I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dr: Ir=%p/%u, Dr=%p/%u => Ir %d, Dr %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}


VG_REGPARM(2)
static void log_0I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult DrRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dr: Dr=%p/%u => Dr %d\n",
             data, ii->data_size, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}


/* Instruction doing a write access */

VG_REGPARM(2)
static void log_1I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dw: Ir=%p/%u, Dw=%p/%u => Ir %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(2)
static void log_0I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult DwRes;

   current_ii = ii;
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dw: Dw=%p/%u => Dw %d\n",
             data, ii->data_size, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
      }
      else {
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

/* Instruction doing a read and a write access */

VG_REGPARM(3)
static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult IrRes, DrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_1I2D: Ir=%p/%u, Dr=%p/%u, Dw=%p/%u => Ir %d, Dr %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(3)
static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult DrRes, DwRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_0I2D: Dr=%p/%u, Dw=%p/%u => Dr %d, Dw %d\n",
             data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}


/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;


/* Check that the cache config is sensible; abort if not. */
static
void check_cache(cache_t* cache, Char *name)
{
   /* First check they're all powers of two */
   if (-1 == VG_(log2)(cache->size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s size of %dB not a power of two; aborting.",
                   name, cache->size);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->assoc)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s associativity of %d not a power of two; aborting.",
                   name, cache->assoc);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->line_size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s line size of %dB not a power of two; aborting.",
                   name, cache->line_size);
      VG_(exit)(1);
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      VG_(message)(Vg_UserMsg,
                   "error: %s line size of %dB too small; aborting.",
                   name, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      VG_(message)(Vg_UserMsg,
                   "error: %s cache size of %dB <= line size of %dB; aborting.",
                   name, cache->size, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s associativity > (size / line size); aborting.", name);
      VG_(exit)(1);
   }
}

static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
#define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)

   Int n_clos = 0;

   // Count how many were defined on the command line.
   if (DEFINED(clo_I1_cache)) { n_clos++; }
   if (DEFINED(clo_D1_cache)) { n_clos++; }
   if (DEFINED(clo_L2_cache)) { n_clos++; }

   // Set the cache config (using auto-detection, if supported by the
   // architecture)
   VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }

   // Then check the values; abort if not acceptable.
   check_cache(I1c, "I1");
   check_cache(D1c, "D1");
   check_cache(L2c, "L2");

   if (VG_(clo_verbosity) > 1) {
      VG_(message)(Vg_UserMsg, "Cache configuration used:");
      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
                   I1c->size, I1c->assoc, I1c->line_size);
      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
                   D1c->size, D1c->assoc, D1c->line_size);
      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
                   L2c->size, L2c->assoc, L2c->line_size);
   }
#undef DEFINED
}


/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
   /* Cache configurations. */
   cache_t I1c, D1c, L2c;

   /* Initialize access handlers */
   if (!CLG_(clo).simulate_cache) {
      CLG_(cachesim).log_1I0D       = 0;
      CLG_(cachesim).log_1I0D_name  = "(no function)";

      CLG_(cachesim).log_1I1Dr      = 0;
      CLG_(cachesim).log_1I1Dw      = 0;
      CLG_(cachesim).log_1I2D       = 0;
      CLG_(cachesim).log_1I1Dr_name = "(no function)";
      CLG_(cachesim).log_1I1Dw_name = "(no function)";
      CLG_(cachesim).log_1I2D_name  = "(no function)";

      CLG_(cachesim).log_0I1Dr      = 0;
      CLG_(cachesim).log_0I1Dw      = 0;
      CLG_(cachesim).log_0I2D       = 0;
      CLG_(cachesim).log_0I1Dr_name = "(no function)";
      CLG_(cachesim).log_0I1Dw_name = "(no function)";
      CLG_(cachesim).log_0I2D_name  = "(no function)";
      return;
   }

   /* Configuration of caches is only needed with real cache simulation */
   configure_caches(&I1c, &D1c, &L2c);

   I1.name = "I1";
   D1.name = "D1";
   L2.name = "L2";

   cachesim_initcache(I1c, &I1);
   cachesim_initcache(D1c, &D1);
   cachesim_initcache(L2c, &L2);

   /* All cache simulator variants use the standard log_* helpers,
    * dispatching via the simulator struct */

   CLG_(cachesim).log_1I0D       = log_1I0D;
   CLG_(cachesim).log_1I0D_name  = "log_1I0D";

   CLG_(cachesim).log_1I1Dr      = log_1I1Dr;
   CLG_(cachesim).log_1I1Dw      = log_1I1Dw;
   CLG_(cachesim).log_1I2D       = log_1I2D;
   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
   CLG_(cachesim).log_1I2D_name  = "log_1I2D";

   CLG_(cachesim).log_0I1Dr      = log_0I1Dr;
   CLG_(cachesim).log_0I1Dw      = log_0I1Dw;
   CLG_(cachesim).log_0I2D       = log_0I2D;
   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
   CLG_(cachesim).log_0I2D_name  = "log_0I2D";

   if (clo_collect_cacheuse) {

      /* Warn about unsupported option combinations */
      if (clo_simulate_hwpref) {
         VG_(message)(Vg_DebugMsg,
                      "warning: prefetch simulation cannot be used with cache usage");
         clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
         VG_(message)(Vg_DebugMsg,
                      "warning: write-back simulation cannot be used with cache usage");
         clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
   }

   if (clo_simulate_hwpref) {
      prefetch_clear();

      if (clo_simulate_writeback) {
         simulator.I1_Read  = prefetch_I1_Read;
         simulator.D1_Read  = prefetch_D1_Read;
         simulator.D1_Write = prefetch_D1_Write;
      }
      else {
         simulator.I1_Read  = prefetch_I1_ref;
         simulator.D1_Read  = prefetch_D1_ref;
         simulator.D1_Write = prefetch_D1_ref;
      }

      return;
   }

   if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
   }
   else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
   }
}


/* Clear simulator state. Has to be initialized before. */
static
void cachesim_clear(void)
{
   cachesim_clearcache(&I1);
   cachesim_clearcache(&D1);
   cachesim_clearcache(&L2);

   prefetch_clear();
}


static void cachesim_getdesc(Char* buf)
{
   Int p;
   p  = VG_(sprintf)(buf,   "\ndesc: I1 cache: %s\n", I1.desc_line);
   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
   VG_(sprintf)(buf+p,      "desc: L2 cache: %s\n", L2.desc_line);
}

static
void cachesim_print_opts(void)
{
   VG_(printf)(
"\n   cache simulator options:\n"
"    --simulate-cache=no|yes   Do cache simulation [no]\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
   );
}

static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
{
   int   i1, i2, i3;
   int   i;
   char *opt = VG_(strdup)(orig_opt);

   i = i1 = opt_len;

   /* Option looks like "--I1=65536,2,64".
    * Find commas, replace with NULs to make three independent
    * strings, then extract numbers.  Yuck. */
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i2 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i3 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if ('\0' != opt[i]) goto bad;

   cache->size      = (Int)VG_(atoll)(opt + i1);
   cache->assoc     = (Int)VG_(atoll)(opt + i2);
   cache->line_size = (Int)VG_(atoll)(opt + i3);

   VG_(free)(opt);

   return;

  bad:
   VG_(err_bad_option)(orig_opt);
}

/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
   if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
      clo_simulate_writeback = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
      clo_simulate_writeback = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
      clo_simulate_hwpref = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
      clo_simulate_hwpref = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
      clo_simulate_sectors = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
      clo_simulate_sectors = False;

   else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
      clo_collect_cacheuse = True;
      /* Use counters only make sense with fine-grained dumping */
      CLG_(clo).dump_instr = True;
   }
   else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
      clo_collect_cacheuse = False;

   /* 5 is the length of "--I1=" */
   else if (0 == VG_(strncmp)(arg, "--I1=", 5))
      parse_opt(&clo_I1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--D1=", 5))
      parse_opt(&clo_D1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--L2=", 5))
      parse_opt(&clo_L2_cache, arg, 5);
   else
      return False;

   return True;
}

/* Adds commas to ULong, right justifying in a field field_width wide, returns
 * the string in buf. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}
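
/* Usage sketch (illustrative): commify(1234567, 12, buf) fills buf with
 * "   1,234,567" -- 7 digits plus 2 commas, right-justified in a field
 * of 12 -- and returns the unpadded length 9.
 */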

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;  /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
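
/* Usage sketch (illustrative): the callers below first scale the ratio
 * by 100*p, so one decimal digit survives the integer division; e.g.
 * with p = 10, percentify(234, 10, 8, buf) yields "   23.4%".
 */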
1738
1739static
1740void cachesim_printstat(void)
1741{
1742 FullCost total = CLG_(total_cost), D_total = 0;
1743 ULong L2_total_m, L2_total_mr, L2_total_mw,
1744 L2_total, L2_total_r, L2_total_w;
1745 char buf1[RESULTS_BUF_LEN],
1746 buf2[RESULTS_BUF_LEN],
1747 buf3[RESULTS_BUF_LEN];
1748 Int l1, l2, l3;
1749 Int p;
1750
1751 if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1752 VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu",
1753 prefetch_up);
1754 VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu",
1755 prefetch_down);
1756 VG_(message)(Vg_DebugMsg, "");
1757 }
1758
1759 /* I cache results. Use the I_refs value to determine the first column
1760 * width. */
1761 l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
1762 VG_(message)(Vg_UserMsg, "I refs: %s", buf1);
1763
1764 if (!CLG_(clo).simulate_cache) return;
1765
1766 commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
1767 VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);
1768
1769 commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
1770 VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1);
1771
1772 p = 100;
1773
   if (0 == total[CLG_(sets).off_full_Ir])
      total[CLG_(sets).off_full_Ir] = 1;

   percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);

   percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
   VG_(message)(Vg_UserMsg, "");

   /* D cache results. Use the D_refs.rd and D_refs.wr values to
    * determine the width of columns 2 & 3. */

   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
   CLG_(init_cost)( CLG_(sets).full, D_total);
   CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
   CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );

   commify( D_total[0], l1, buf1);
   l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
   l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
   VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   commify( D_total[1], l1, buf1);
   commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
   commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
   VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   commify( D_total[2], l1, buf1);
   commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
   commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
   VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   p = 10;

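   /* Avoid division by zero below: a zero access count implies a zero
    * miss count, so the corresponding rates are 0 anyway. */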
   if (0 == D_total[0]) D_total[0] = 1;
   if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
   if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;

   percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
   percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
   percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2, buf3);

   percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
   percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
   percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2, buf3);
   VG_(message)(Vg_UserMsg, "");

   /* L2 overall results */

   L2_total =
      total[CLG_(sets).off_full_Dr +1] +
      total[CLG_(sets).off_full_Dw +1] +
      total[CLG_(sets).off_full_Ir +1];
   L2_total_r =
      total[CLG_(sets).off_full_Dr +1] +
      total[CLG_(sets).off_full_Ir +1];
   L2_total_w = total[CLG_(sets).off_full_Dw +1];
   commify(L2_total, l1, buf1);
   commify(L2_total_r, l2, buf2);
   commify(L2_total_w, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   L2_total_m =
      total[CLG_(sets).off_full_Dr +2] +
      total[CLG_(sets).off_full_Dw +2] +
      total[CLG_(sets).off_full_Ir +2];
   L2_total_mr =
      total[CLG_(sets).off_full_Dr +2] +
      total[CLG_(sets).off_full_Ir +2];
   L2_total_mw = total[CLG_(sets).off_full_Dw +2];
   commify(L2_total_m, l1, buf1);
   commify(L2_total_mr, l2, buf2);
   commify(L2_total_mw, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
                buf1, buf2, buf3);

   percentify(L2_total_m * 100 * p /
              (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1);
   percentify(L2_total_mr * 100 * p /
              (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
              p, l2+1, buf2);
   percentify(L2_total_mw * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )",
                buf1, buf2, buf3);
}
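
/* Worked example (invented numbers): with 1,000,000 I refs and 2,000 I1
 * misses, the I1 miss rate above is 2000 * 100 * 100 / 1000000 = 20 in
 * fixed point, and percentify(20, 100, l1+1, buf1) prints "0.20%". */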


/*------------------------------------------------------------*/
/*--- Setup for Event set. ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)(Int max_user)
{
   EventType *e1, *e2, *e3, *e4;
   EventSet *Ir, *Dr, *Dw;
   EventSet *D0, *D1r, *D1w, *D2;
   EventSet *sim, *full;
   EventSet *use;
   int sizeOfUseIr;

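   /* Costs are grouped into event sets. "Use" holds the optional
    * cache-use events (access cost and spatial loss, per cache level);
    * Ir/Dr/Dw hold the events of one access type: access count, L1 miss
    * count, L2 miss count and, with write-back simulation, the dirty
    * miss count. */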
   use = CLG_(get_eventset)("Use", 4);
   if (clo_collect_cacheuse) {
      /* if the total use count (TUse) is 0, there never was a load,
       * and thus no loss either */
      e1 = CLG_(register_eventtype)("AcCost1");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("SpLoss1");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("AcCost2");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("SpLoss2");
      CLG_(add_eventtype)(use, e1);
   }

   Ir = CLG_(get_eventset)("Ir", 4);
   Dr = CLG_(get_eventset)("Dr", 4);
   Dw = CLG_(get_eventset)("Dw", 4);
   if (CLG_(clo).simulate_cache) {
      e1 = CLG_(register_eventtype)("Ir");
      e2 = CLG_(register_eventtype)("I1mr");
      e3 = CLG_(register_eventtype)("I2mr");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("I2dmr");
         CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Ir, e1,e2,e3);

      e1 = CLG_(register_eventtype)("Dr");
      e2 = CLG_(register_eventtype)("D1mr");
      e3 = CLG_(register_eventtype)("D2mr");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("D2dmr");
         CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Dr, e1,e2,e3);

      e1 = CLG_(register_eventtype)("Dw");
      e2 = CLG_(register_eventtype)("D1mw");
      e3 = CLG_(register_eventtype)("D2mw");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("D2dmw");
         CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Dw, e1,e2,e3);
   }
   else {
      e1 = CLG_(register_eventtype)("Ir");
      CLG_(add_eventtype)(Ir, e1);
   }

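   /* Combined event sets for the four instruction kinds: D0 for
    * instructions without data access (Use+Ir), D1r for one data read
    * (Use+Ir+Dr), D1w for one data write (Use+Ir+Dw), and D2 for one
    * read plus one write (Use+Ir+Dr+Dw). */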
   sizeOfUseIr = use->size + Ir->size;
   D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
   CLG_(add_eventset)(D0, use);
   off_D0_Ir = CLG_(add_eventset)(D0, Ir);

   D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
   CLG_(add_eventset)(D1r, use);
   off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
   off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);

   D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
   CLG_(add_eventset)(D1w, use);
   off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
   off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);

   D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
   CLG_(add_eventset)(D2, use);
   off_D2_Ir = CLG_(add_eventset)(D2, Ir);
   off_D2_Dr = CLG_(add_eventset)(D2, Dr);
   off_D2_Dw = CLG_(add_eventset)(D2, Dw);

   sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
   CLG_(add_eventset)(sim, use);
   CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
   CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
   CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);

   if (CLG_(clo).collect_alloc) max_user += 2;
   if (CLG_(clo).collect_systime) max_user += 2;

   full = CLG_(get_eventset)("full", sim->size + max_user);
   CLG_(add_eventset)(full, sim);
   CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
   CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
   CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;

   CLG_(sets).use = use;
   CLG_(sets).Ir = Ir;
   CLG_(sets).Dr = Dr;
   CLG_(sets).Dw = Dw;

   CLG_(sets).D0 = D0;
   CLG_(sets).D1r = D1r;
   CLG_(sets).D1w = D1w;
   CLG_(sets).D2 = D2;

   CLG_(sets).sim = sim;
   CLG_(sets).full = full;

   if (CLG_(clo).collect_alloc) {
      e1 = CLG_(register_eventtype)("allocCount");
      e2 = CLG_(register_eventtype)("allocSize");
      CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
   }

   if (CLG_(clo).collect_systime) {
      e1 = CLG_(register_eventtype)("sysCount");
      e2 = CLG_(register_eventtype)("sysTime");
      CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
   }

   CLG_DEBUGIF(1) {
      CLG_DEBUG(1, "EventSets:\n");
      CLG_(print_eventset)(-2, use);
      CLG_(print_eventset)(-2, Ir);
      CLG_(print_eventset)(-2, Dr);
      CLG_(print_eventset)(-2, Dw);
      CLG_(print_eventset)(-2, sim);
      CLG_(print_eventset)(-2, full);
   }

   /* Event names that do not exist in the event set are silently ignored */
   CLG_(dumpmap) = CLG_(get_eventmapping)(full);
   CLG_(append_event)(CLG_(dumpmap), "Ir");
   CLG_(append_event)(CLG_(dumpmap), "Dr");
   CLG_(append_event)(CLG_(dumpmap), "Dw");
   CLG_(append_event)(CLG_(dumpmap), "I1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mw");
   CLG_(append_event)(CLG_(dumpmap), "I2mr");
   CLG_(append_event)(CLG_(dumpmap), "D2mr");
   CLG_(append_event)(CLG_(dumpmap), "D2mw");
   CLG_(append_event)(CLG_(dumpmap), "I2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmw");
   CLG_(append_event)(CLG_(dumpmap), "AcCost1");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
   CLG_(append_event)(CLG_(dumpmap), "AcCost2");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
   CLG_(append_event)(CLG_(dumpmap), "allocCount");
   CLG_(append_event)(CLG_(dumpmap), "allocSize");
   CLG_(append_event)(CLG_(dumpmap), "sysCount");
   CLG_(append_event)(CLG_(dumpmap), "sysTime");
}


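/* Add the cost entries of event set 'es' (one of D0/D1r/D1w/D2) into
 * the 'sim'-layout cost array 'dst', zeroing the source entries. */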
static
void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
{
   /* if the 'use' eventset is defined, it always comes first (hardcoded!) */
   CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);

   /* FIXME: This is hardcoded... */
   if (es == CLG_(sets).D0) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D0_Ir);
   }
   else if (es == CLG_(sets).D1r) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D1r_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
                               cost + off_D1r_Dr);
   }
   else if (es == CLG_(sets).D1w) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D1w_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
                               cost + off_D1w_Dw);
   }
   else {
      CLG_ASSERT(es == CLG_(sets).D2);
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D2_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
                               cost + off_D2_Dr);
      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
                               cost + off_D2_Dw);
   }
}

/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
   if (!CLG_(clo).simulate_cache)
      cost[CLG_(sets).off_sim_Ir] += exe_count;
   else {

#if 0
      /* There always is a trivial case where exe_count and the Ir event
       * count can differ slightly, as ecounter is only updated when the
       * next BB starts executing: e.g. for the last BB executed, or when
       * collection state is toggled. */
      /* FIXME: Hardcoded that each eventset has Ir as first */
      if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
         VG_(printf)("==> Ir %llu, exe %llu\n",
                     (bbcc->cost + ii->cost_offset)[0], exe_count);
         CLG_(print_bbcc_cost)(-2, bbcc);
         //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
      }
#endif

      add_and_zero_Dx(ii->eventset, cost,
                      bbcc->cost + ii->cost_offset);
   }
}

static
void cachesim_after_bbsetup(void)
{
   BBCC* bbcc = CLG_(current_state).bbcc;

   if (CLG_(clo).simulate_cache) {
      BB* bb = bbcc->bb;

      /* only needed if log_* functions are called */
      bb_base = bb->obj->offset + bb->offset;
      cost_base = bbcc->cost;
   }
}

static
void cachesim_finish(void)
{
   if (clo_collect_cacheuse)
      cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
   .print_opts = cachesim_print_opts,
   .parse_opt = cachesim_parse_opt,
   .post_clo_init = cachesim_post_clo_init,
   .clear = cachesim_clear,
   .getdesc = cachesim_getdesc,
   .printstat = cachesim_printstat,
   .add_icost = cachesim_add_icost,
   .after_bbsetup = cachesim_after_bbsetup,
   .finish = cachesim_finish,

   /* these will be set by cachesim_post_clo_init */
   .log_1I0D = 0,

   .log_1I1Dr = 0,
   .log_1I1Dw = 0,
   .log_1I2D = 0,

   .log_0I1Dr = 0,
   .log_0I1Dw = 0,
   .log_0I2D = 0,

   .log_1I0D_name = "(no function)",

   .log_1I1Dr_name = "(no function)",
   .log_1I1Dw_name = "(no function)",
   .log_1I2D_name = "(no function)",

   .log_0I1Dr_name = "(no function)",
   .log_0I1Dw_name = "(no function)",
   .log_0I2D_name = "(no function)"
};


/*--------------------------------------------------------------------*/
/*--- end sim.c ---*/
/*--------------------------------------------------------------------*/
