Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 1 | //===- FuzzerDFSan.cpp - DFSan-based fuzzer mutator -----------------------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // DataFlowSanitizer (DFSan) is a tool for |
| 10 | // generalised dynamic data flow (taint) analysis: |
| 11 | // http://clang.llvm.org/docs/DataFlowSanitizer.html . |
| 12 | // |
| 13 | // This file implements a mutation algorithm based on taint |
| 14 | // analysis feedback from DFSan. |
| 15 | // |
| 16 | // The approach has some similarity to "Taint-based Directed Whitebox Fuzzing" |
| 17 | // by Vijay Ganesh & Tim Leek & Martin Rinard: |
| 18 | // http://dspace.mit.edu/openaccess-disseminate/1721.1/59320, |
| 19 | // but it uses a full blown LLVM IR taint analysis and separate instrumentation |
| 20 | // to analyze all of the "attack points" at once. |
| 21 | // |
| 22 | // Workflow: |
| 23 | // * lib/Fuzzer/Fuzzer*.cpp is compiled w/o any instrumentation. |
| 24 | // * The code under test is compiled with DFSan *and* with special extra hooks |
| 25 | // that are inserted before dfsan. Currently supported hooks: |
| 26 | // - __sanitizer_cov_trace_cmp: inserted before every ICMP instruction, |
| 27 | // receives the type, size and arguments of ICMP. |
| 28 | // * Every call to HOOK(a,b) is replaced by DFSan with |
| 29 | // __dfsw_HOOK(a, b, label(a), label(b)) so that __dfsw_HOOK |
| 30 | // gets all the taint labels for the arguments. |
| 31 | // * At the Fuzzer startup we assign a unique DFSan label |
| 32 | // to every byte of the input string (Fuzzer::CurrentUnit) so that for any |
| 33 | // chunk of data we know which input bytes it has derived from. |
| 34 | // * The __dfsw_* functions (implemented in this file) record the |
| 35 | // parameters (i.e. the application data and the corresponding taint labels) |
| 36 | // in a global state. |
| 37 | // * Fuzzer::MutateWithDFSan() tries to use the data recorded by __dfsw_* |
| 38 | // hooks to guide the fuzzing towards new application states. |
| 39 | // For example if 4 bytes of data that derive from input bytes {4,5,6,7} |
| 40 | // are compared with a constant 12345 and the comparison always yields |
| 41 | // the same result, we try to insert 12345, 12344, 12346 into bytes |
| 42 | // {4,5,6,7} of the next fuzzed inputs. |
| 43 | // |
| 44 | // This code does not function when DFSan is not linked in. |
| 45 | // Instead of using ifdefs and thus requiring a separate build of lib/Fuzzer |
| 46 | // we redeclare the dfsan_* interface functions as weak and check if they |
| 47 | // are nullptr before calling. |
| 48 | // If this approach proves to be useful we may add attribute(weak) to the |
| 49 | // dfsan declarations in dfsan_interface.h |
| 50 | // |
| 51 | // This module is in the "proof of concept" stage. |
| 52 | // It is capable of solving only the simplest puzzles |
| 53 | // like test/dfsan/DFSanSimpleCmpTest.cpp. |
| 54 | //===----------------------------------------------------------------------===// |
| 55 | |
| 56 | /* Example of manual usage: |
| 57 | ( |
| 58 | cd $LLVM/lib/Fuzzer/ |
| 59 | clang -fPIC -c -g -O2 -std=c++11 Fuzzer*.cpp |
| 60 | clang++ -O0 -std=c++11 -fsanitize-coverage=3 \ |
| 61 | -mllvm -sanitizer-coverage-experimental-trace-compares=1 \ |
Kostya Serebryany | 3befe94 | 2015-05-06 22:47:24 +0000 | [diff] [blame^] | 62 | -fsanitize=dataflow \ |
Kostya Serebryany | 16d03bd | 2015-03-30 22:09:51 +0000 | [diff] [blame] | 63 | test/dfsan/DFSanSimpleCmpTest.cpp Fuzzer*.o |
| 64 | ./a.out |
| 65 | ) |
| 66 | */ |
| 67 | |
| 68 | #include "FuzzerInternal.h" |
| 69 | #include <sanitizer/dfsan_interface.h> |
| 70 | |
| 71 | #include <cstring> |
| 72 | #include <iostream> |
| 73 | #include <unordered_map> |
| 74 | |
| 75 | extern "C" { |
| 76 | __attribute__((weak)) |
| 77 | dfsan_label dfsan_create_label(const char *desc, void *userdata); |
| 78 | __attribute__((weak)) |
| 79 | void dfsan_set_label(dfsan_label label, void *addr, size_t size); |
| 80 | __attribute__((weak)) |
| 81 | void dfsan_add_label(dfsan_label label, void *addr, size_t size); |
| 82 | __attribute__((weak)) |
| 83 | const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label); |
| 84 | } // extern "C" |
| 85 | |
| 86 | namespace { |
| 87 | |
| 88 | // These values are copied from include/llvm/IR/InstrTypes.h. |
| 89 | // We do not include the LLVM headers here to remain independent. |
| 90 | // If these values ever change, an assertion in ComputeCmp will fail. |
| 91 | enum Predicate { |
| 92 | ICMP_EQ = 32, ///< equal |
| 93 | ICMP_NE = 33, ///< not equal |
| 94 | ICMP_UGT = 34, ///< unsigned greater than |
| 95 | ICMP_UGE = 35, ///< unsigned greater or equal |
| 96 | ICMP_ULT = 36, ///< unsigned less than |
| 97 | ICMP_ULE = 37, ///< unsigned less or equal |
| 98 | ICMP_SGT = 38, ///< signed greater than |
| 99 | ICMP_SGE = 39, ///< signed greater or equal |
| 100 | ICMP_SLT = 40, ///< signed less than |
| 101 | ICMP_SLE = 41, ///< signed less or equal |
| 102 | }; |
| 103 | |
| 104 | template <class U, class S> |
| 105 | bool ComputeCmp(size_t CmpType, U Arg1, U Arg2) { |
| 106 | switch(CmpType) { |
| 107 | case ICMP_EQ : return Arg1 == Arg2; |
| 108 | case ICMP_NE : return Arg1 != Arg2; |
| 109 | case ICMP_UGT: return Arg1 > Arg2; |
| 110 | case ICMP_UGE: return Arg1 >= Arg2; |
| 111 | case ICMP_ULT: return Arg1 < Arg2; |
| 112 | case ICMP_ULE: return Arg1 <= Arg2; |
| 113 | case ICMP_SGT: return (S)Arg1 > (S)Arg2; |
| 114 | case ICMP_SGE: return (S)Arg1 >= (S)Arg2; |
| 115 | case ICMP_SLT: return (S)Arg1 < (S)Arg2; |
| 116 | case ICMP_SLE: return (S)Arg1 <= (S)Arg2; |
| 117 | default: assert(0 && "unsupported CmpType"); |
| 118 | } |
| 119 | return false; |
| 120 | } |
| 121 | |
| 122 | static bool ComputeCmp(size_t CmpSize, size_t CmpType, uint64_t Arg1, |
| 123 | uint64_t Arg2) { |
| 124 | if (CmpSize == 8) return ComputeCmp<uint64_t, int64_t>(CmpType, Arg1, Arg2); |
| 125 | if (CmpSize == 4) return ComputeCmp<uint32_t, int32_t>(CmpType, Arg1, Arg2); |
| 126 | if (CmpSize == 2) return ComputeCmp<uint16_t, int16_t>(CmpType, Arg1, Arg2); |
| 127 | if (CmpSize == 1) return ComputeCmp<uint8_t, int8_t>(CmpType, Arg1, Arg2); |
| 128 | assert(0 && "unsupported type size"); |
| 129 | return true; |
| 130 | } |
| 131 | |
| 132 | // As a simplification we use the range of input bytes instead of a set of input |
| 133 | // bytes. |
| 134 | struct LabelRange { |
| 135 | uint16_t Beg, End; // Range is [Beg, End), thus Beg==End is an empty range. |
| 136 | |
| 137 | LabelRange(uint16_t Beg = 0, uint16_t End = 0) : Beg(Beg), End(End) {} |
| 138 | |
| 139 | static LabelRange Join(LabelRange LR1, LabelRange LR2) { |
| 140 | if (LR1.Beg == LR1.End) return LR2; |
| 141 | if (LR2.Beg == LR2.End) return LR1; |
| 142 | return {std::min(LR1.Beg, LR2.Beg), std::max(LR1.End, LR2.End)}; |
| 143 | } |
| 144 | LabelRange &Join(LabelRange LR) { |
| 145 | return *this = Join(*this, LR); |
| 146 | } |
| 147 | static LabelRange Singleton(const dfsan_label_info *LI) { |
| 148 | uint16_t Idx = (uint16_t)(uintptr_t)LI->userdata; |
| 149 | assert(Idx > 0); |
| 150 | return {(uint16_t)(Idx - 1), Idx}; |
| 151 | } |
| 152 | }; |
| 153 | |
| 154 | std::ostream &operator<<(std::ostream &os, const LabelRange &LR) { |
| 155 | return os << "[" << LR.Beg << "," << LR.End << ")"; |
| 156 | } |
| 157 | |
| 158 | class DFSanState { |
| 159 | public: |
| 160 | DFSanState(const fuzzer::Fuzzer::FuzzingOptions &Options) |
| 161 | : Options(Options) {} |
| 162 | |
| 163 | struct CmpSiteInfo { |
| 164 | size_t ResCounters[2] = {0, 0}; |
| 165 | size_t CmpSize = 0; |
| 166 | LabelRange LR; |
| 167 | std::unordered_map<uint64_t, size_t> CountedConstants; |
| 168 | }; |
| 169 | |
| 170 | LabelRange GetLabelRange(dfsan_label L); |
| 171 | void DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType, |
| 172 | uint64_t Arg1, uint64_t Arg2, dfsan_label L1, |
| 173 | dfsan_label L2); |
| 174 | bool Mutate(fuzzer::Unit *U); |
| 175 | |
| 176 | private: |
| 177 | std::unordered_map<uintptr_t, CmpSiteInfo> PcToCmpSiteInfoMap; |
| 178 | LabelRange LabelRanges[1 << (sizeof(dfsan_label) * 8)] = {}; |
| 179 | const fuzzer::Fuzzer::FuzzingOptions &Options; |
| 180 | }; |
| 181 | |
| 182 | LabelRange DFSanState::GetLabelRange(dfsan_label L) { |
| 183 | LabelRange &LR = LabelRanges[L]; |
| 184 | if (LR.Beg < LR.End || L == 0) |
| 185 | return LR; |
| 186 | const dfsan_label_info *LI = dfsan_get_label_info(L); |
| 187 | if (LI->l1 || LI->l2) |
| 188 | return LR = LabelRange::Join(GetLabelRange(LI->l1), GetLabelRange(LI->l2)); |
| 189 | return LR = LabelRange::Singleton(LI); |
| 190 | } |
| 191 | |
| 192 | void DFSanState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType, |
| 193 | uint64_t Arg1, uint64_t Arg2, dfsan_label L1, |
| 194 | dfsan_label L2) { |
| 195 | if (L1 == 0 && L2 == 0) |
| 196 | return; // Not actionable. |
| 197 | if (L1 != 0 && L2 != 0) |
| 198 | return; // Probably still actionable. |
| 199 | bool Res = ComputeCmp(CmpSize, CmpType, Arg1, Arg2); |
| 200 | CmpSiteInfo &CSI = PcToCmpSiteInfoMap[PC]; |
| 201 | CSI.CmpSize = CmpSize; |
| 202 | CSI.LR.Join(GetLabelRange(L1)).Join(GetLabelRange(L2)); |
| 203 | if (!L1) CSI.CountedConstants[Arg1]++; |
| 204 | if (!L2) CSI.CountedConstants[Arg2]++; |
| 205 | size_t Counter = CSI.ResCounters[Res]++; |
| 206 | |
| 207 | if (Options.Verbosity >= 2 && |
| 208 | (Counter & (Counter - 1)) == 0 && |
| 209 | CSI.ResCounters[!Res] == 0) |
| 210 | std::cerr << "DFSAN:" |
| 211 | << " PC " << std::hex << PC << std::dec |
| 212 | << " S " << CmpSize |
| 213 | << " T " << CmpType |
| 214 | << " A1 " << Arg1 << " A2 " << Arg2 << " R " << Res |
| 215 | << " L" << L1 << GetLabelRange(L1) |
| 216 | << " L" << L2 << GetLabelRange(L2) |
| 217 | << " LR " << CSI.LR |
| 218 | << "\n"; |
| 219 | } |
| 220 | |
| 221 | bool DFSanState::Mutate(fuzzer::Unit *U) { |
| 222 | for (auto &PCToCmp : PcToCmpSiteInfoMap) { |
| 223 | auto &CSI = PCToCmp.second; |
| 224 | if (CSI.ResCounters[0] * CSI.ResCounters[1] != 0) continue; |
| 225 | if (CSI.ResCounters[0] + CSI.ResCounters[1] < 1000) continue; |
| 226 | if (CSI.CountedConstants.size() != 1) continue; |
| 227 | uintptr_t C = CSI.CountedConstants.begin()->first; |
| 228 | if (U->size() >= CSI.CmpSize) { |
| 229 | size_t RangeSize = CSI.LR.End - CSI.LR.Beg; |
| 230 | size_t Idx = CSI.LR.Beg + rand() % RangeSize; |
| 231 | if (Idx + CSI.CmpSize > U->size()) continue; |
| 232 | C += rand() % 5 - 2; |
| 233 | memcpy(U->data() + Idx, &C, CSI.CmpSize); |
| 234 | return true; |
| 235 | } |
| 236 | } |
| 237 | return false; |
| 238 | } |
| 239 | |
| 240 | static DFSanState *DFSan; |
| 241 | |
| 242 | } // namespace |
| 243 | |
| 244 | namespace fuzzer { |
| 245 | |
| 246 | bool Fuzzer::MutateWithDFSan(Unit *U) { |
| 247 | if (!&dfsan_create_label || !DFSan) return false; |
| 248 | return DFSan->Mutate(U); |
| 249 | } |
| 250 | |
| 251 | void Fuzzer::InitializeDFSan() { |
| 252 | if (!&dfsan_create_label || !Options.UseDFSan) return; |
| 253 | DFSan = new DFSanState(Options); |
| 254 | CurrentUnit.resize(Options.MaxLen); |
| 255 | for (size_t i = 0; i < static_cast<size_t>(Options.MaxLen); i++) { |
| 256 | dfsan_label L = dfsan_create_label("input", (void*)(i + 1)); |
| 257 | // We assume that no one else has called dfsan_create_label before. |
| 258 | assert(L == i + 1); |
| 259 | dfsan_set_label(L, &CurrentUnit[i], 1); |
| 260 | } |
| 261 | } |
| 262 | |
| 263 | } // namespace fuzzer |
| 264 | |
| 265 | extern "C" { |
| 266 | void __dfsw___sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1, |
| 267 | uint64_t Arg2, dfsan_label L0, |
| 268 | dfsan_label L1, dfsan_label L2) { |
| 269 | assert(L0 == 0); |
| 270 | uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0)); |
| 271 | uint64_t CmpSize = (SizeAndType >> 32) / 8; |
| 272 | uint64_t Type = (SizeAndType << 32) >> 32; |
| 273 | DFSan->DFSanCmpCallback(PC, CmpSize, Type, Arg1, Arg2, L1, L2); |
| 274 | } |
| 275 | } // extern "C" |