blob: 622fd29e846c496659fc9db3587d1b753d82e29b [file] [log] [blame]
Derek Bruening88639852016-05-25 02:04:04 +00001//===-- working_set.cpp ---------------------------------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file is a part of EfficiencySanitizer, a family of performance tuners.
11//
12// This file contains working-set-specific code.
13//===----------------------------------------------------------------------===//
14
15#include "working_set.h"
16#include "esan.h"
Derek Brueninge78e4a62016-06-03 16:27:50 +000017#include "esan_circular_buffer.h"
Derek Bruening88639852016-05-25 02:04:04 +000018#include "esan_flags.h"
19#include "esan_shadow.h"
Derek Bruening07814762016-06-03 16:14:07 +000020#include "esan_sideline.h"
Derek Bruening8ef3f0f2016-05-31 13:41:07 +000021#include "sanitizer_common/sanitizer_procmaps.h"
Derek Bruening88639852016-05-25 02:04:04 +000022
23// We shadow every cache line of app memory with one shadow byte.
24// - The highest bit of each shadow byte indicates whether the corresponding
25// cache line has ever been accessed.
26// - The lowest bit of each shadow byte indicates whether the corresponding
27// cache line was accessed since the last sample.
Derek Brueninge78e4a62016-06-03 16:27:50 +000028// - The other bits are used for working set snapshots at successively
29// lower frequencies, each bit to the left from the lowest bit stepping
30// down the frequency by 2 to the power of getFlags()->snapshot_step.
31// Thus we have something like this:
32// Bit 0: Since last sample
33// Bit 1: Since last 2^2 samples
34// Bit 2: Since last 2^4 samples
35// Bit 3: ...
36// Bit 7: Ever accessed.
Derek Bruening88639852016-05-25 02:04:04 +000037// We live with races in accessing each shadow byte.
38typedef unsigned char byte;
39
40namespace __esan {
41
Derek Bruening8ef3f0f2016-05-31 13:41:07 +000042// Our shadow memory assumes that the line size is 64.
43static const u32 CacheLineSize = 64;
44
Derek Bruening88639852016-05-25 02:04:04 +000045// See the shadow byte layout description above.
46static const u32 TotalWorkingSetBitIdx = 7;
Derek Brueninge78e4a62016-06-03 16:27:50 +000047// We accumulate to the left until we hit this bit.
48// We don't need to accumulate to the final bit as it's set on each ref
49// by the compiler instrumentation.
50static const u32 MaxAccumBitIdx = 6;
Derek Bruening88639852016-05-25 02:04:04 +000051static const u32 CurWorkingSetBitIdx = 0;
52static const byte ShadowAccessedVal =
53 (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);
54
Derek Bruening07814762016-06-03 16:14:07 +000055static SidelineThread Thread;
56// If we use real-time-based timer samples this won't overflow in any realistic
57// scenario, but if we switch to some other unit (such as memory accesses) we
58// may want to consider a 64-bit int.
59static u32 SnapshotNum;
60
Derek Brueninge78e4a62016-06-03 16:27:50 +000061// We store the wset size for each of 8 different sampling frequencies.
62static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
63// We cannot use static objects as the global destructor is called
64// prior to our finalize routine.
65// These are each circular buffers, sized up front.
66CircularBuffer<u32> SizePerFreq[NumFreq];
67// We cannot rely on static initializers (they may run too late) but
68// we record the size here for clarity:
69u32 CircularBufferSizes[NumFreq] = {
70 // These are each mmap-ed so our minimum is one page.
71 32*1024,
72 16*1024,
73 8*1024,
74 4*1024,
75 4*1024,
76 4*1024,
77 4*1024,
78 4*1024,
79};
80
Derek Bruening88639852016-05-25 02:04:04 +000081void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
82 bool IsWrite) {
83 if (Size == 0)
84 return;
85 SIZE_T I = 0;
86 uptr LineSize = getFlags()->cache_line_size;
87 // As Addr+Size could overflow at the top of a 32-bit address space,
88 // we avoid the simpler formula that rounds the start and end.
89 SIZE_T NumLines = Size / LineSize +
90 // Add any extra at the start or end adding on an extra line:
91 (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
92 byte *Shadow = (byte *)appToShadow(Addr);
93 // Write shadow bytes until we're word-aligned.
94 while (I < NumLines && (uptr)Shadow % 4 != 0) {
95 if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
96 *Shadow |= ShadowAccessedVal;
97 ++Shadow;
98 ++I;
99 }
100 // Write whole shadow words at a time.
101 // Using a word-stride loop improves the runtime of a microbenchmark of
102 // memset calls by 10%.
103 u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
104 ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
105 while (I + 4 <= NumLines) {
106 if ((*(u32*)Shadow & WordValue) != WordValue)
107 *(u32*)Shadow |= WordValue;
108 Shadow += 4;
109 I += 4;
110 }
111 // Write any trailing shadow bytes.
112 while (I < NumLines) {
113 if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
114 *Shadow |= ShadowAccessedVal;
115 ++Shadow;
116 ++I;
117 }
118}
119
Derek Bruening8ef3f0f2016-05-31 13:41:07 +0000120// This routine will word-align ShadowStart and ShadowEnd prior to scanning.
121static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
122 uptr ShadowEnd) {
123 u32 WorkingSetSize = 0;
124 u32 ByteValue = 0x1 << BitIdx;
125 u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
126 ByteValue << 24;
127 // Get word aligned start.
128 ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
Derek Brueninge78e4a62016-06-03 16:27:50 +0000129 bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
Derek Bruening8ef3f0f2016-05-31 13:41:07 +0000130 for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
131 if ((*Ptr & WordValue) != 0) {
132 byte *BytePtr = (byte *)Ptr;
133 for (u32 j = 0; j < sizeof(u32); ++j) {
134 if (BytePtr[j] & ByteValue) {
135 ++WorkingSetSize;
Derek Brueninge78e4a62016-06-03 16:27:50 +0000136 if (Accum) {
137 // Accumulate to the lower-frequency bit to the left.
138 BytePtr[j] |= (ByteValue << 1);
139 }
Derek Bruening8ef3f0f2016-05-31 13:41:07 +0000140 }
141 }
142 // Clear this bit from every shadow byte.
143 *Ptr &= ~WordValue;
144 }
145 }
146 return WorkingSetSize;
147}
148
149// Scan shadow memory to calculate the number of cache lines being accessed,
150// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
151// We also clear the lowest bits (most recent working set snapshot).
152static u32 computeWorkingSizeAndReset(u32 BitIdx) {
153 u32 WorkingSetSize = 0;
154 MemoryMappingLayout MemIter(true/*cache*/);
155 uptr Start, End, Prot;
156 while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/,
157 0/*file size*/, &Prot)) {
158 VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n",
159 __FUNCTION__, Start, End, Prot, isAppMem(Start),
160 isShadowMem(Start));
161 if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) {
162 VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End);
163 WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End);
164 }
165 }
166 return WorkingSetSize;
167}
168
Derek Bruening07814762016-06-03 16:14:07 +0000169// This is invoked from a signal handler but in a sideline thread doing nothing
170// else so it is a little less fragile than a typical signal handler.
171static void takeSample(void *Arg) {
Derek Brueninge78e4a62016-06-03 16:27:50 +0000172 u32 BitIdx = CurWorkingSetBitIdx;
173 u32 Freq = 1;
174 ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
175 while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
176 u32 NumLines = computeWorkingSizeAndReset(BitIdx);
177 VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
178 SnapshotNum, BitIdx, Freq, NumLines);
179 SizePerFreq[BitIdx].push_back(NumLines);
180 Freq = Freq << getFlags()->snapshot_step;
181 BitIdx++;
182 }
Derek Bruening07814762016-06-03 16:14:07 +0000183}
184
Derek Brueningf6f149d2016-06-14 15:15:38 +0000185// Initialization that must be done before any instrumented code is executed.
186void initializeShadowWorkingSet() {
Derek Bruening8ef3f0f2016-05-31 13:41:07 +0000187 CHECK(getFlags()->cache_line_size == CacheLineSize);
Derek Bruening8e74c102016-05-31 13:21:03 +0000188 registerMemoryFaultHandler();
Derek Brueningf6f149d2016-06-14 15:15:38 +0000189}
Derek Bruening07814762016-06-03 16:14:07 +0000190
Derek Brueningf6f149d2016-06-14 15:15:38 +0000191void initializeWorkingSet() {
Derek Brueninge78e4a62016-06-03 16:27:50 +0000192 if (getFlags()->record_snapshots) {
193 for (u32 i = 0; i < NumFreq; ++i)
194 SizePerFreq[i].initialize(CircularBufferSizes[i]);
Derek Bruening07814762016-06-03 16:14:07 +0000195 Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
Derek Brueninge78e4a62016-06-03 16:27:50 +0000196 }
197}
198
199static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
200 if (MilliSec > 600000) {
201 Unit = "min";
202 return MilliSec / 60000;
203 } else if (MilliSec > 10000) {
204 Unit = "sec";
205 return MilliSec / 1000;
206 } else {
207 Unit = "ms";
208 return MilliSec;
209 }
Derek Bruening88639852016-05-25 02:04:04 +0000210}
211
Derek Bruening8ef3f0f2016-05-31 13:41:07 +0000212static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
213 // We need a constant to avoid software divide support:
214 static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
215 static const u32 MegabyteCachelines = KilobyteCachelines << 10;
216
217 if (NumOfCachelines > 10 * MegabyteCachelines) {
218 Unit = "MB";
219 return NumOfCachelines / MegabyteCachelines;
220 } else if (NumOfCachelines > 10 * KilobyteCachelines) {
221 Unit = "KB";
222 return NumOfCachelines / KilobyteCachelines;
223 } else {
224 Unit = "Bytes";
225 return NumOfCachelines * CacheLineSize;
226 }
227}
228
Derek Bruening88639852016-05-25 02:04:04 +0000229int finalizeWorkingSet() {
Derek Brueninge78e4a62016-06-03 16:27:50 +0000230 const char *Unit;
231 if (getFlags()->record_snapshots) {
Derek Bruening07814762016-06-03 16:14:07 +0000232 Thread.joinThread();
Derek Brueninge78e4a62016-06-03 16:27:50 +0000233 u32 Freq = 1;
234 Report(" Total number of samples: %u\n", SnapshotNum);
235 for (u32 i = 0; i < NumFreq; ++i) {
236 u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
237 Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
238 // FIXME: report whether we wrapped around and thus whether we
239 // have data on the whole run or just the last N samples.
240 for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
241 u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
242 Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
243 SizePerFreq[i][j]);
244 }
245 Freq = Freq << getFlags()->snapshot_step;
246 SizePerFreq[i].free();
247 }
248 }
Derek Bruening07814762016-06-03 16:14:07 +0000249
Derek Bruening8ef3f0f2016-05-31 13:41:07 +0000250 // Get the working set size for the entire execution.
251 u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
Derek Bruening8ef3f0f2016-05-31 13:41:07 +0000252 u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
253 Report(" %s: the total working set size: %u %s (%u cache lines)\n",
254 SanitizerToolName, Size, Unit, NumOfCachelines);
Derek Bruening88639852016-05-25 02:04:04 +0000255 return 0;
256}
257
258} // namespace __esan