Kostya Serebryany | 980e45f | 2018-05-31 01:27:07 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | #===- lib/fuzzer/scripts/collect_data_flow.py ------------------------------===# |
| 3 | # |
| 4 | # The LLVM Compiler Infrastructure |
| 5 | # |
| 6 | # This file is distributed under the University of Illinois Open Source |
| 7 | # License. See LICENSE.TXT for details. |
| 8 | # |
| 9 | #===------------------------------------------------------------------------===# |
| 10 | # Runs the data-flow tracer several times on the same input in order to collect |
| 11 | # the complete trace for all input bytes (running it on all bytes at once |
| 12 | # may fail if DFSan runs out of labels). |
| 13 | # Usage: |
Kostya Serebryany | 1fd005f | 2018-06-06 01:23:29 +0000 | [diff] [blame] | 14 | # |
| 15 | # # Collect dataflow for one input, store it in OUTPUT (default is stdout) |
| 16 | # collect_data_flow.py BINARY INPUT [OUTPUT] |
| 17 | # |
| 18 | # # Collect dataflow for all inputs in CORPUS_DIR, store them in OUTPUT_DIR |
| 19 | # collect_data_flow.py BINARY CORPUS_DIR OUTPUT_DIR |
Kostya Serebryany | 980e45f | 2018-05-31 01:27:07 +0000 | [diff] [blame] | 20 | #===------------------------------------------------------------------------===# |
| 21 | import atexit |
Kostya Serebryany | 1fd005f | 2018-06-06 01:23:29 +0000 | [diff] [blame] | 22 | import hashlib |
Kostya Serebryany | 980e45f | 2018-05-31 01:27:07 +0000 | [diff] [blame] | 23 | import sys |
| 24 | import os |
| 25 | import subprocess |
| 26 | import tempfile |
| 27 | import shutil |
| 28 | |
# Module-level placeholder. Note that main() assigns its own local variable
# named tmpdir (there is no `global` statement), so this value stays "".
tmpdir = ""
| 30 | |
def cleanup(d):
    """Report the directory `d` on stdout, then delete it recursively.

    Registered by main() as an atexit handler for the temporary directory.
    """
    notice = "removing: %s" % d
    print(notice)
    shutil.rmtree(d)
| 34 | |
def collect_dataflow_for_corpus(self, exe, corpus_dir, output_dir):
    """Collect dataflow for every file under `corpus_dir` into `output_dir`.

    Args:
      self: path of the script to invoke once per corpus file, as
        `self exe input_path output_path` (main() passes argv[0]).
      exe: path of the binary; forwarded to each per-file invocation and
        finally run once with no arguments.
      corpus_dir: directory walked recursively for input files.
      output_dir: directory to create; it must not exist yet.

    Each per-file output is named after the SHA-1 hex digest of the input's
    contents. The stdout of the argument-less `exe` run is stored in
    `output_dir/functions.txt`. Exit codes of the subprocesses are ignored.

    Raises:
      AssertionError: if `output_dir` already exists.
    """
    print("Collecting dataflow for corpus: %s output_dir: %s" % (corpus_dir,
                                                                 output_dir))
    assert not os.path.exists(output_dir), (
        "output_dir already exists: %s" % output_dir)
    os.mkdir(output_dir)
    for root, _dirs, files in os.walk(corpus_dir):
        for name in files:
            path = os.path.join(root, name)
            # Corpus inputs are arbitrary bytes: read them in binary mode so
            # hashing works on Python 3 and no newline translation occurs,
            # and close the handle promptly instead of leaking it.
            with open(path, "rb") as corpus_file:
                sha1 = hashlib.sha1(corpus_file.read()).hexdigest()
            output = os.path.join(output_dir, sha1)
            subprocess.call([self, exe, path, output])
    # The child writes straight to the file descriptor; the context manager
    # guarantees the file is closed once the child has finished.
    with open(os.path.join(output_dir, "functions.txt"), "w") as functions_txt:
        subprocess.call([exe], stdout=functions_txt)
| 48 | |
| 49 | |
def main(argv):
    """Collect the data-flow trace for one input file or a whole corpus.

    Args:
      argv: command line, `[script, BINARY, INPUT, [OUTPUT]]`. If INPUT is a
        directory, OUTPUT is required and the work is delegated to
        collect_dataflow_for_corpus(). Otherwise INPUT is a single file and
        OUTPUT (default: stdout) receives the merged trace.

    In single-file mode the binary is run as
    `BINARY begin end INPUT tmpfile` on the byte range [0, size). Whenever a
    run exits non-zero and the range holds at least two bytes, the range is
    halved and both halves are retried. The per-range outputs are then passed
    to `merge_data_flow.py`, looked up next to this script.
    """
    usage = ("usage: collect_data_flow.py BINARY INPUT [OUTPUT]\n"
             "       collect_data_flow.py BINARY CORPUS_DIR OUTPUT_DIR")
    if len(argv) < 3:
        sys.exit(usage)
    exe = argv[1]
    inp = argv[2]
    if os.path.isdir(inp):
        if len(argv) < 4:
            sys.exit(usage)
        return collect_dataflow_for_corpus(argv[0], exe, inp, argv[3])

    size = os.path.getsize(inp)
    pending = [[0, size]]
    tmpdir = tempfile.mkdtemp(prefix="libfuzzer-tmp-")
    atexit.register(cleanup, tmpdir)
    # Single-argument print() with %-formatting emits the same text on both
    # Python 2 and Python 3, matching the other functions in this file.
    print("tmpdir:  %s" % tmpdir)
    outputs = []
    while pending:
        lo, hi = pending.pop()
        print("******* Trying:  %s" % [lo, hi])
        tmpfile = os.path.join(tmpdir, "%d-%d" % (lo, hi))
        ret = subprocess.call([exe, str(lo), str(hi), inp, tmpfile])
        if ret and hi - lo >= 2:
            # Floor division keeps the bounds integral on Python 3; true
            # division would hand the tracer offsets such as "2.5".
            mid = (lo + hi) // 2
            pending.append([lo, mid])
            pending.append([mid, hi])
        else:
            # NOTE(review): a range of fewer than two bytes lands here even
            # when the run failed, so its tmpfile is still handed to the
            # merge step.
            outputs.append(tmpfile)
            print("******* Success:  %s" % [lo, hi])

    merge = os.path.join(os.path.dirname(argv[0]), "merge_data_flow.py")
    if len(argv) >= 4:
        # Close the output file deterministically once merging is done.
        with open(argv[3], "w") as f:
            subprocess.call([merge] + outputs, stdout=f)
    else:
        subprocess.call([merge] + outputs, stdout=sys.stdout)
| 77 | |
# Script entry point: run main() with the raw command line only when this
# file is executed directly rather than imported.
if __name__ == "__main__":
    main(sys.argv)