Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 1 | //===- FuzzerDFSan.cpp - DFSan-based fuzzer mutator -----------------------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // DataFlowSanitizer (DFSan) is a tool for |
| 10 | // generalised dynamic data flow (taint) analysis: |
| 11 | // http://clang.llvm.org/docs/DataFlowSanitizer.html . |
| 12 | // |
| 13 | // This file implements a mutation algorithm based on taint |
| 14 | // analysis feedback from DFSan. |
| 15 | // |
| 16 | // The approach has some similarity to "Taint-based Directed Whitebox Fuzzing" |
| 17 | // by Vijay Ganesh & Tim Leek & Martin Rinard: |
| 18 | // http://dspace.mit.edu/openaccess-disseminate/1721.1/59320, |
| 19 | // but it uses a full blown LLVM IR taint analysis and separate instrumentation |
| 20 | // to analyze all of the "attack points" at once. |
| 21 | // |
| 22 | // Workflow: |
| 23 | // * lib/Fuzzer/Fuzzer*.cpp is compiled w/o any instrumentation. |
| 24 | // * The code under test is compiled with DFSan *and* with special extra hooks |
| 25 | // that are inserted before dfsan. Currently supported hooks: |
| 26 | // - __sanitizer_cov_trace_cmp: inserted before every ICMP instruction, |
| 27 | // receives the type, size and arguments of ICMP. |
| 28 | // * Every call to HOOK(a,b) is replaced by DFSan with |
| 29 | // __dfsw_HOOK(a, b, label(a), label(b)) so that __dfsw_HOOK |
| 30 | // gets all the taint labels for the arguments. |
| 31 | // * At the Fuzzer startup we assign a unique DFSan label |
| 32 | // to every byte of the input string (Fuzzer::CurrentUnit) so that for any |
| 33 | // chunk of data we know which input bytes it has derived from. |
| 34 | // * The __dfsw_* functions (implemented in this file) record the |
| 35 | // parameters (i.e. the application data and the corresponding taint labels) |
| 36 | // in a global state. |
| 37 | // * Fuzzer::MutateWithDFSan() tries to use the data recorded by __dfsw_* |
| 38 | // hooks to guide the fuzzing towards new application states. |
| 39 | // For example if 4 bytes of data that derive from input bytes {4,5,6,7} |
| 40 | // are compared with a constant 12345 and the comparison always yields |
| 41 | // the same result, we try to insert 12345, 12344, 12346 into bytes |
| 42 | // {4,5,6,7} of the next fuzzed inputs. |
| 43 | // |
| 44 | // This code does not function when DFSan is not linked in. |
| 45 | // Instead of using ifdefs and thus requiring a separate build of lib/Fuzzer |
| 46 | // we redeclare the dfsan_* interface functions as weak and check if they |
| 47 | // are nullptr before calling. |
| 48 | // If this approach proves to be useful we may add attribute(weak) to the |
| 49 | // dfsan declarations in dfsan_interface.h |
| 50 | // |
| 51 | // This module is in the "proof of concept" stage. |
| 52 | // It is capable of solving only the simplest puzzles |
| 53 | // like test/dfsan/DFSanSimpleCmpTest.cpp. |
| 54 | //===----------------------------------------------------------------------===// |
| 55 | |
| 56 | /* Example of manual usage: |
| 57 | ( |
| 58 | cd $LLVM/lib/Fuzzer/ |
| 59 | clang -fPIC -c -g -O2 -std=c++11 Fuzzer*.cpp |
Alexey Samsonov | 21a3381 | 2015-05-07 23:33:24 +0000 | [diff] [blame^] | 60 | clang++ -O0 -std=c++11 -fsanitize-coverage=edge,trace-cmp \ |
Kostya Serebryany | 3befe94 | 2015-05-06 22:47:24 +0000 | [diff] [blame] | 61 | -fsanitize=dataflow \ |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 62 | test/dfsan/DFSanSimpleCmpTest.cpp Fuzzer*.o |
| 63 | ./a.out |
| 64 | ) |
| 65 | */ |
| 66 | |
| 67 | #include "FuzzerInternal.h" |
| 68 | #include <sanitizer/dfsan_interface.h> |
| 69 | |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 70 | #include <algorithm> |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 71 | #include <cstring> |
| 72 | #include <iostream> |
| 73 | #include <unordered_map> |
| 74 | |
| 75 | extern "C" { |
| 76 | __attribute__((weak)) |
| 77 | dfsan_label dfsan_create_label(const char *desc, void *userdata); |
| 78 | __attribute__((weak)) |
| 79 | void dfsan_set_label(dfsan_label label, void *addr, size_t size); |
| 80 | __attribute__((weak)) |
| 81 | void dfsan_add_label(dfsan_label label, void *addr, size_t size); |
| 82 | __attribute__((weak)) |
| 83 | const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label); |
Kostya Serebryany | a407dde | 2015-05-07 00:11:33 +0000 | [diff] [blame] | 84 | __attribute__((weak)) |
| 85 | dfsan_label dfsan_read_label(const void *addr, size_t size); |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 86 | } // extern "C" |
| 87 | |
| 88 | namespace { |
| 89 | |
| 90 | // These values are copied from include/llvm/IR/InstrTypes.h. |
| 91 | // We do not include the LLVM headers here to remain independent. |
| 92 | // If these values ever change, an assertion in ComputeCmp will fail. |
enum Predicate {
  ICMP_EQ  = 32,  ///< equal
  ICMP_NE  = 33,  ///< not equal
  ICMP_UGT = 34,  ///< unsigned greater than
  ICMP_UGE = 35,  ///< unsigned greater or equal
  ICMP_ULT = 36,  ///< unsigned less than
  ICMP_ULE = 37,  ///< unsigned less or equal
  ICMP_SGT = 38,  ///< signed greater than
  ICMP_SGE = 39,  ///< signed greater or equal
  ICMP_SLT = 40,  ///< signed less than
  ICMP_SLE = 41,  ///< signed less or equal
};

// Evaluates the comparison "Arg1 <CmpType> Arg2" at a fixed operand width.
//
// U is the unsigned integer type of the operands (converting the caller's
// values to U truncates them to the compare width); S is the signed type of
// the same width, used for the ICMP_S* predicates.
// CmpType must be one of the Predicate values. Any other value trips the
// assertion and, when assertions are compiled out, yields false.
template <class U, class S>
bool ComputeCmp(size_t CmpType, U Arg1, U Arg2) {
  const S Signed1 = static_cast<S>(Arg1);
  const S Signed2 = static_cast<S>(Arg2);
  switch (CmpType) {
    case ICMP_EQ:
      return Arg1 == Arg2;
    case ICMP_NE:
      return Arg1 != Arg2;
    case ICMP_UGT:
      return Arg1 > Arg2;
    case ICMP_UGE:
      return Arg1 >= Arg2;
    case ICMP_ULT:
      return Arg1 < Arg2;
    case ICMP_ULE:
      return Arg1 <= Arg2;
    case ICMP_SGT:
      return Signed1 > Signed2;
    case ICMP_SGE:
      return Signed1 >= Signed2;
    case ICMP_SLT:
      return Signed1 < Signed2;
    case ICMP_SLE:
      return Signed1 <= Signed2;
    default:
      assert(0 && "unsupported CmpType");
  }
  return false;
}

// Evaluates "Arg1 <CmpType> Arg2" for operands that are CmpSize bytes wide.
//
// Widths of 1, 2, 4 and 8 bytes are evaluated exactly at that width.
// Any other width is handled on a best-effort basis instead of asserting:
// dfsan_weak_hook_memcmp forwards the memcmp length as CmpSize, so arbitrary
// sizes do reach this function. For those sizes (in)equality is decided on
// the full 64-bit operands, and every other predicate returns true.
static bool ComputeCmp(size_t CmpSize, size_t CmpType, uint64_t Arg1,
                       uint64_t Arg2) {
  switch (CmpSize) {
    case 8:
      return ComputeCmp<uint64_t, int64_t>(CmpType, Arg1, Arg2);
    case 4:
      return ComputeCmp<uint32_t, int32_t>(CmpType, Arg1, Arg2);
    case 2:
      return ComputeCmp<uint16_t, int16_t>(CmpType, Arg1, Arg2);
    case 1:
      return ComputeCmp<uint8_t, int8_t>(CmpType, Arg1, Arg2);
    default:
      break;
  }
  // Non-power-of-two or oversized compare: only (in)equality is meaningful
  // without knowing the operand width.
  if (CmpType == ICMP_EQ) return Arg1 == Arg2;
  if (CmpType == ICMP_NE) return Arg1 != Arg2;
  return true;
}
| 133 | |
| 134 | // As a simplification we use the range of input bytes instead of a set of input |
| 135 | // bytes. |
| 136 | struct LabelRange { |
| 137 | uint16_t Beg, End; // Range is [Beg, End), thus Beg==End is an empty range. |
| 138 | |
| 139 | LabelRange(uint16_t Beg = 0, uint16_t End = 0) : Beg(Beg), End(End) {} |
| 140 | |
| 141 | static LabelRange Join(LabelRange LR1, LabelRange LR2) { |
| 142 | if (LR1.Beg == LR1.End) return LR2; |
| 143 | if (LR2.Beg == LR2.End) return LR1; |
| 144 | return {std::min(LR1.Beg, LR2.Beg), std::max(LR1.End, LR2.End)}; |
| 145 | } |
| 146 | LabelRange &Join(LabelRange LR) { |
| 147 | return *this = Join(*this, LR); |
| 148 | } |
| 149 | static LabelRange Singleton(const dfsan_label_info *LI) { |
| 150 | uint16_t Idx = (uint16_t)(uintptr_t)LI->userdata; |
| 151 | assert(Idx > 0); |
| 152 | return {(uint16_t)(Idx - 1), Idx}; |
| 153 | } |
| 154 | }; |
| 155 | |
| 156 | std::ostream &operator<<(std::ostream &os, const LabelRange &LR) { |
| 157 | return os << "[" << LR.Beg << "," << LR.End << ")"; |
| 158 | } |
| 159 | |
// A single candidate mutation derived from a recorded comparison.
// Deliberately minimal for now: overwrite Size bytes of the unit, starting at
// offset Pos, with the leading Size bytes of Data.
struct TraceBasedMutation {
  size_t Pos;     // Offset in the unit where the bytes are written.
  size_t Size;    // Number of bytes to write.
  uint64_t Data;  // Value whose in-memory bytes are written.
};
| 166 | |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 167 | class DFSanState { |
| 168 | public: |
| 169 | DFSanState(const fuzzer::Fuzzer::FuzzingOptions &Options) |
| 170 | : Options(Options) {} |
| 171 | |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 172 | LabelRange GetLabelRange(dfsan_label L); |
| 173 | void DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType, |
| 174 | uint64_t Arg1, uint64_t Arg2, dfsan_label L1, |
| 175 | dfsan_label L2); |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 176 | |
| 177 | void StartTraceRecording() { |
| 178 | RecordingTraces = true; |
| 179 | Mutations.clear(); |
| 180 | } |
| 181 | |
| 182 | size_t StopTraceRecording() { |
| 183 | RecordingTraces = false; |
| 184 | std::random_shuffle(Mutations.begin(), Mutations.end()); |
| 185 | return Mutations.size(); |
| 186 | } |
| 187 | |
| 188 | void ApplyTraceBasedMutation(size_t Idx, fuzzer::Unit *U); |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 189 | |
| 190 | private: |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 191 | bool RecordingTraces = false; |
| 192 | std::vector<TraceBasedMutation> Mutations; |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 193 | LabelRange LabelRanges[1 << (sizeof(dfsan_label) * 8)] = {}; |
| 194 | const fuzzer::Fuzzer::FuzzingOptions &Options; |
| 195 | }; |
| 196 | |
| 197 | LabelRange DFSanState::GetLabelRange(dfsan_label L) { |
| 198 | LabelRange &LR = LabelRanges[L]; |
| 199 | if (LR.Beg < LR.End || L == 0) |
| 200 | return LR; |
| 201 | const dfsan_label_info *LI = dfsan_get_label_info(L); |
| 202 | if (LI->l1 || LI->l2) |
| 203 | return LR = LabelRange::Join(GetLabelRange(LI->l1), GetLabelRange(LI->l2)); |
| 204 | return LR = LabelRange::Singleton(LI); |
| 205 | } |
| 206 | |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 207 | void DFSanState::ApplyTraceBasedMutation(size_t Idx, fuzzer::Unit *U) { |
| 208 | assert(Idx < Mutations.size()); |
| 209 | auto &M = Mutations[Idx]; |
| 210 | if (Options.Verbosity >= 3) |
| 211 | std::cerr << "TBM " << M.Pos << " " << M.Size << " " << M.Data << "\n"; |
| 212 | if (M.Pos + M.Size > U->size()) return; |
| 213 | memcpy(U->data() + M.Pos, &M.Data, M.Size); |
| 214 | } |
| 215 | |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 216 | void DFSanState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType, |
| 217 | uint64_t Arg1, uint64_t Arg2, dfsan_label L1, |
| 218 | dfsan_label L2) { |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 219 | if (!RecordingTraces) return; |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 220 | if (L1 == 0 && L2 == 0) |
| 221 | return; // Not actionable. |
| 222 | if (L1 != 0 && L2 != 0) |
| 223 | return; // Probably still actionable. |
| 224 | bool Res = ComputeCmp(CmpSize, CmpType, Arg1, Arg2); |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 225 | uint64_t Data = L1 ? Arg2 : Arg1; |
| 226 | LabelRange LR = L1 ? GetLabelRange(L1) : GetLabelRange(L2); |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 227 | |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 228 | for (size_t Pos = LR.Beg; Pos + CmpSize <= LR.End; Pos++) { |
| 229 | Mutations.push_back({Pos, CmpSize, Data}); |
| 230 | Mutations.push_back({Pos, CmpSize, Data + 1}); |
| 231 | Mutations.push_back({Pos, CmpSize, Data - 1}); |
| 232 | } |
| 233 | |
| 234 | if (CmpSize > LR.End - LR.Beg) |
| 235 | Mutations.push_back({LR.Beg, (unsigned)(LR.End - LR.Beg), Data}); |
| 236 | |
| 237 | |
| 238 | if (Options.Verbosity >= 3) |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 239 | std::cerr << "DFSAN:" |
| 240 | << " PC " << std::hex << PC << std::dec |
| 241 | << " S " << CmpSize |
| 242 | << " T " << CmpType |
| 243 | << " A1 " << Arg1 << " A2 " << Arg2 << " R " << Res |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 244 | << " L" << L1 |
| 245 | << " L" << L2 |
| 246 | << " R" << LR |
| 247 | << " MU " << Mutations.size() |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 248 | << "\n"; |
| 249 | } |
| 250 | |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 251 | static DFSanState *DFSan; |
| 252 | |
| 253 | } // namespace |
| 254 | |
| 255 | namespace fuzzer { |
| 256 | |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 257 | void Fuzzer::StartTraceRecording() { |
| 258 | if (!DFSan) return; |
| 259 | DFSan->StartTraceRecording(); |
| 260 | } |
| 261 | |
| 262 | size_t Fuzzer::StopTraceRecording() { |
| 263 | if (!DFSan) return 0; |
| 264 | return DFSan->StopTraceRecording(); |
| 265 | } |
| 266 | |
| 267 | void Fuzzer::ApplyTraceBasedMutation(size_t Idx, Unit *U) { |
| 268 | assert(DFSan); |
| 269 | DFSan->ApplyTraceBasedMutation(Idx, U); |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 270 | } |
| 271 | |
| 272 | void Fuzzer::InitializeDFSan() { |
| 273 | if (!&dfsan_create_label || !Options.UseDFSan) return; |
| 274 | DFSan = new DFSanState(Options); |
| 275 | CurrentUnit.resize(Options.MaxLen); |
| 276 | for (size_t i = 0; i < static_cast<size_t>(Options.MaxLen); i++) { |
| 277 | dfsan_label L = dfsan_create_label("input", (void*)(i + 1)); |
| 278 | // We assume that no one else has called dfsan_create_label before. |
| 279 | assert(L == i + 1); |
| 280 | dfsan_set_label(L, &CurrentUnit[i], 1); |
| 281 | } |
| 282 | } |
| 283 | |
| 284 | } // namespace fuzzer |
| 285 | |
| 286 | extern "C" { |
| 287 | void __dfsw___sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1, |
| 288 | uint64_t Arg2, dfsan_label L0, |
| 289 | dfsan_label L1, dfsan_label L2) { |
| 290 | assert(L0 == 0); |
| 291 | uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0)); |
| 292 | uint64_t CmpSize = (SizeAndType >> 32) / 8; |
| 293 | uint64_t Type = (SizeAndType << 32) >> 32; |
| 294 | DFSan->DFSanCmpCallback(PC, CmpSize, Type, Arg1, Arg2, L1, L2); |
| 295 | } |
Kostya Serebryany | a407dde | 2015-05-07 00:11:33 +0000 | [diff] [blame] | 296 | |
| 297 | void dfsan_weak_hook_memcmp(void *caller_pc, const void *s1, const void *s2, |
| 298 | size_t n, dfsan_label s1_label, |
| 299 | dfsan_label s2_label, dfsan_label n_label) { |
| 300 | uintptr_t PC = reinterpret_cast<uintptr_t>(caller_pc); |
Kostya Serebryany | beb24c3 | 2015-05-07 21:02:11 +0000 | [diff] [blame] | 301 | uint64_t S1 = 0, S2 = 0; |
Kostya Serebryany | a407dde | 2015-05-07 00:11:33 +0000 | [diff] [blame] | 302 | // Simplification: handle only first 8 bytes. |
| 303 | memcpy(&S1, s1, std::min(n, sizeof(S1))); |
| 304 | memcpy(&S2, s2, std::min(n, sizeof(S2))); |
| 305 | dfsan_label L1 = dfsan_read_label(s1, n); |
| 306 | dfsan_label L2 = dfsan_read_label(s2, n); |
| 307 | DFSan->DFSanCmpCallback(PC, n, ICMP_EQ, S1, S2, L1, L2); |
| 308 | } |
Kostya Serebryany | 7d470cf | 2015-05-07 18:32:29 +0000 | [diff] [blame] | 309 | |
// Non-DFSan flavour of the comparison hook. It is the symbol that gets called
// from functions on which dfsan is disabled; it intentionally does nothing.
// FIXME: implement poor man's taint analysis here (w/o dfsan).
void __sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
                               uint64_t Arg2) {
  (void)SizeAndType;
  (void)Arg1;
  (void)Arg2;
}
| 315 | |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 316 | } // extern "C" |