Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 1 | //===-- working_set.cpp ---------------------------------------------------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // This file is a part of EfficiencySanitizer, a family of performance tuners. |
| 11 | // |
| 12 | // This file contains working-set-specific code. |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "working_set.h" |
| 16 | #include "esan.h" |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 17 | #include "esan_circular_buffer.h" |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 18 | #include "esan_flags.h" |
| 19 | #include "esan_shadow.h" |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 20 | #include "esan_sideline.h" |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 21 | #include "sanitizer_common/sanitizer_procmaps.h" |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 22 | |
| 23 | // We shadow every cache line of app memory with one shadow byte. |
| 24 | // - The highest bit of each shadow byte indicates whether the corresponding |
| 25 | // cache line has ever been accessed. |
| 26 | // - The lowest bit of each shadow byte indicates whether the corresponding |
| 27 | // cache line was accessed since the last sample. |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 28 | // - The other bits are used for working set snapshots at successively |
| 29 | // lower frequencies, each bit to the left from the lowest bit stepping |
| 30 | // down the frequency by 2 to the power of getFlags()->snapshot_step. |
| 31 | // Thus we have something like this: |
| 32 | // Bit 0: Since last sample |
| 33 | // Bit 1: Since last 2^2 samples |
| 34 | // Bit 2: Since last 2^4 samples |
| 35 | // Bit 3: ... |
| 36 | // Bit 7: Ever accessed. |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 37 | // We live with races in accessing each shadow byte. |
| 38 | typedef unsigned char byte; |
| 39 | |
| 40 | namespace __esan { |
| 41 | |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 42 | // Our shadow memory assumes that the line size is 64. |
| 43 | static const u32 CacheLineSize = 64; |
| 44 | |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 45 | // See the shadow byte layout description above. |
| 46 | static const u32 TotalWorkingSetBitIdx = 7; |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 47 | // We accumulate to the left until we hit this bit. |
| 48 | // We don't need to accumulate to the final bit as it's set on each ref |
| 49 | // by the compiler instrumentation. |
| 50 | static const u32 MaxAccumBitIdx = 6; |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 51 | static const u32 CurWorkingSetBitIdx = 0; |
| 52 | static const byte ShadowAccessedVal = |
| 53 | (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx); |
| 54 | |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 55 | static SidelineThread Thread; |
| 56 | // If we use real-time-based timer samples this won't overflow in any realistic |
| 57 | // scenario, but if we switch to some other unit (such as memory accesses) we |
| 58 | // may want to consider a 64-bit int. |
| 59 | static u32 SnapshotNum; |
| 60 | |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 61 | // We store the wset size for each of 8 different sampling frequencies. |
| 62 | static const u32 NumFreq = 8; // One for each bit of our shadow bytes. |
| 63 | // We cannot use static objects as the global destructor is called |
| 64 | // prior to our finalize routine. |
| 65 | // These are each circular buffers, sized up front. |
| 66 | CircularBuffer<u32> SizePerFreq[NumFreq]; |
| 67 | // We cannot rely on static initializers (they may run too late) but |
| 68 | // we record the size here for clarity: |
| 69 | u32 CircularBufferSizes[NumFreq] = { |
| 70 | // These are each mmap-ed so our minimum is one page. |
| 71 | 32*1024, |
| 72 | 16*1024, |
| 73 | 8*1024, |
| 74 | 4*1024, |
| 75 | 4*1024, |
| 76 | 4*1024, |
| 77 | 4*1024, |
| 78 | 4*1024, |
| 79 | }; |
| 80 | |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 81 | void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, |
| 82 | bool IsWrite) { |
| 83 | if (Size == 0) |
| 84 | return; |
| 85 | SIZE_T I = 0; |
| 86 | uptr LineSize = getFlags()->cache_line_size; |
| 87 | // As Addr+Size could overflow at the top of a 32-bit address space, |
| 88 | // we avoid the simpler formula that rounds the start and end. |
| 89 | SIZE_T NumLines = Size / LineSize + |
| 90 | // Add any extra at the start or end adding on an extra line: |
| 91 | (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize; |
| 92 | byte *Shadow = (byte *)appToShadow(Addr); |
| 93 | // Write shadow bytes until we're word-aligned. |
| 94 | while (I < NumLines && (uptr)Shadow % 4 != 0) { |
| 95 | if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) |
| 96 | *Shadow |= ShadowAccessedVal; |
| 97 | ++Shadow; |
| 98 | ++I; |
| 99 | } |
| 100 | // Write whole shadow words at a time. |
| 101 | // Using a word-stride loop improves the runtime of a microbenchmark of |
| 102 | // memset calls by 10%. |
| 103 | u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 | |
| 104 | ShadowAccessedVal << 16 | ShadowAccessedVal << 24; |
| 105 | while (I + 4 <= NumLines) { |
| 106 | if ((*(u32*)Shadow & WordValue) != WordValue) |
| 107 | *(u32*)Shadow |= WordValue; |
| 108 | Shadow += 4; |
| 109 | I += 4; |
| 110 | } |
| 111 | // Write any trailing shadow bytes. |
| 112 | while (I < NumLines) { |
| 113 | if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) |
| 114 | *Shadow |= ShadowAccessedVal; |
| 115 | ++Shadow; |
| 116 | ++I; |
| 117 | } |
| 118 | } |
| 119 | |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 120 | // This routine will word-align ShadowStart and ShadowEnd prior to scanning. |
| 121 | static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart, |
| 122 | uptr ShadowEnd) { |
| 123 | u32 WorkingSetSize = 0; |
| 124 | u32 ByteValue = 0x1 << BitIdx; |
| 125 | u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 | |
| 126 | ByteValue << 24; |
| 127 | // Get word aligned start. |
| 128 | ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 129 | bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 130 | for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { |
| 131 | if ((*Ptr & WordValue) != 0) { |
| 132 | byte *BytePtr = (byte *)Ptr; |
| 133 | for (u32 j = 0; j < sizeof(u32); ++j) { |
| 134 | if (BytePtr[j] & ByteValue) { |
| 135 | ++WorkingSetSize; |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 136 | if (Accum) { |
| 137 | // Accumulate to the lower-frequency bit to the left. |
| 138 | BytePtr[j] |= (ByteValue << 1); |
| 139 | } |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 140 | } |
| 141 | } |
| 142 | // Clear this bit from every shadow byte. |
| 143 | *Ptr &= ~WordValue; |
| 144 | } |
| 145 | } |
| 146 | return WorkingSetSize; |
| 147 | } |
| 148 | |
| 149 | // Scan shadow memory to calculate the number of cache lines being accessed, |
| 150 | // i.e., the number of non-zero bits indexed by BitIdx in each shadow byte. |
| 151 | // We also clear the lowest bits (most recent working set snapshot). |
| 152 | static u32 computeWorkingSizeAndReset(u32 BitIdx) { |
| 153 | u32 WorkingSetSize = 0; |
| 154 | MemoryMappingLayout MemIter(true/*cache*/); |
| 155 | uptr Start, End, Prot; |
| 156 | while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/, |
| 157 | 0/*file size*/, &Prot)) { |
| 158 | VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", |
| 159 | __FUNCTION__, Start, End, Prot, isAppMem(Start), |
| 160 | isShadowMem(Start)); |
| 161 | if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) { |
| 162 | VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End); |
| 163 | WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End); |
| 164 | } |
| 165 | } |
| 166 | return WorkingSetSize; |
| 167 | } |
| 168 | |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 169 | // This is invoked from a signal handler but in a sideline thread doing nothing |
| 170 | // else so it is a little less fragile than a typical signal handler. |
| 171 | static void takeSample(void *Arg) { |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 172 | u32 BitIdx = CurWorkingSetBitIdx; |
| 173 | u32 Freq = 1; |
| 174 | ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. |
| 175 | while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { |
| 176 | u32 NumLines = computeWorkingSizeAndReset(BitIdx); |
| 177 | VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, |
| 178 | SnapshotNum, BitIdx, Freq, NumLines); |
| 179 | SizePerFreq[BitIdx].push_back(NumLines); |
| 180 | Freq = Freq << getFlags()->snapshot_step; |
| 181 | BitIdx++; |
| 182 | } |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 183 | } |
| 184 | |
Derek Bruening | f6f149d | 2016-06-14 15:15:38 +0000 | [diff] [blame] | 185 | // Initialization that must be done before any instrumented code is executed. |
| 186 | void initializeShadowWorkingSet() { |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 187 | CHECK(getFlags()->cache_line_size == CacheLineSize); |
Derek Bruening | 8e74c10 | 2016-05-31 13:21:03 +0000 | [diff] [blame] | 188 | registerMemoryFaultHandler(); |
Derek Bruening | f6f149d | 2016-06-14 15:15:38 +0000 | [diff] [blame] | 189 | } |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 190 | |
Derek Bruening | f6f149d | 2016-06-14 15:15:38 +0000 | [diff] [blame] | 191 | void initializeWorkingSet() { |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 192 | if (getFlags()->record_snapshots) { |
| 193 | for (u32 i = 0; i < NumFreq; ++i) |
| 194 | SizePerFreq[i].initialize(CircularBufferSizes[i]); |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 195 | Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 196 | } |
| 197 | } |
| 198 | |
| 199 | static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) { |
| 200 | if (MilliSec > 600000) { |
| 201 | Unit = "min"; |
| 202 | return MilliSec / 60000; |
| 203 | } else if (MilliSec > 10000) { |
| 204 | Unit = "sec"; |
| 205 | return MilliSec / 1000; |
| 206 | } else { |
| 207 | Unit = "ms"; |
| 208 | return MilliSec; |
| 209 | } |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 210 | } |
| 211 | |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 212 | static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) { |
| 213 | // We need a constant to avoid software divide support: |
| 214 | static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize; |
| 215 | static const u32 MegabyteCachelines = KilobyteCachelines << 10; |
| 216 | |
| 217 | if (NumOfCachelines > 10 * MegabyteCachelines) { |
| 218 | Unit = "MB"; |
| 219 | return NumOfCachelines / MegabyteCachelines; |
| 220 | } else if (NumOfCachelines > 10 * KilobyteCachelines) { |
| 221 | Unit = "KB"; |
| 222 | return NumOfCachelines / KilobyteCachelines; |
| 223 | } else { |
| 224 | Unit = "Bytes"; |
| 225 | return NumOfCachelines * CacheLineSize; |
| 226 | } |
| 227 | } |
| 228 | |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 229 | int finalizeWorkingSet() { |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 230 | const char *Unit; |
| 231 | if (getFlags()->record_snapshots) { |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 232 | Thread.joinThread(); |
Derek Bruening | e78e4a6 | 2016-06-03 16:27:50 +0000 | [diff] [blame] | 233 | u32 Freq = 1; |
| 234 | Report(" Total number of samples: %u\n", SnapshotNum); |
| 235 | for (u32 i = 0; i < NumFreq; ++i) { |
| 236 | u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); |
| 237 | Report(" Samples array #%d at period %u %s\n", i, Time, Unit); |
| 238 | // FIXME: report whether we wrapped around and thus whether we |
| 239 | // have data on the whole run or just the last N samples. |
| 240 | for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { |
| 241 | u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); |
| 242 | Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, |
| 243 | SizePerFreq[i][j]); |
| 244 | } |
| 245 | Freq = Freq << getFlags()->snapshot_step; |
| 246 | SizePerFreq[i].free(); |
| 247 | } |
| 248 | } |
Derek Bruening | 0781476 | 2016-06-03 16:14:07 +0000 | [diff] [blame] | 249 | |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 250 | // Get the working set size for the entire execution. |
| 251 | u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); |
Derek Bruening | 8ef3f0f | 2016-05-31 13:41:07 +0000 | [diff] [blame] | 252 | u32 Size = getSizeForPrinting(NumOfCachelines, Unit); |
| 253 | Report(" %s: the total working set size: %u %s (%u cache lines)\n", |
| 254 | SanitizerToolName, Size, Unit, NumOfCachelines); |
Derek Bruening | 8863985 | 2016-05-25 02:04:04 +0000 | [diff] [blame] | 255 | return 0; |
| 256 | } |
| 257 | |
| 258 | } // namespace __esan |