// tools/kilobench/kilobench.cpp - platform/external/skia - Gitiles

 /*
  * Copyright 2016 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #include "GrCaps.h"
 #include "GrContextFactory.h"
 #include "Benchmark.h"
 #include "ResultsWriter.h"
 #include "SkCommandLineFlags.h"
 #include "SkOSFile.h"
 #include "SkStream.h"
 #include "SkSurface.h"
 #include "SkTime.h"
 #include "SkTLList.h"
 #include "SkThreadUtils.h"
 #include "Stats.h"
 #include "Timer.h"
 #include "VisualSKPBench.h"
 #include "gl/GrGLDefines.h"
 #include "../private/SkMutex.h"
 #include "../private/SkSemaphore.h"
 #include "../private/SkGpuFenceSync.h"

 // posix only for now
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/wait.h>

 /*
  * This is an experimental GPU only benchmarking program.  The initial implementation will only
  * support SKPs.
  */

 // To get image decoders linked in we have to do the below magic
 #include "SkForceLinking.h"
 #include "SkImageDecoder.h"
 __SK_FORCE_IMAGE_DECODER_LINKING;

 // Sentinel loop count: setup_gpu_bench() auto-tunes the number of loops when
 // Benchmark::calculateLoops(FLAGS_loops) yields this value.
 static const int kAutoTuneLoops = 0;

 // Default for --loops: a single loop in SK_DEBUG builds, auto-tuning otherwise.
 static const int kDefaultLoops =
 #ifdef SK_DEBUG
     1;
 #else
     kAutoTuneLoops;
 #endif

 // Builds the help text for --loops, embedding the auto-tune sentinel value in the message.
 static SkString loops_help_txt() {
     SkString text;
     text.printf("Number of times to run each bench. Set this to %d to auto-"
                 "tune for each bench. Timings are only reported when auto-tuning.",
                 kAutoTuneLoops);
     return text;
 }

 // Command line flags.  Each DEFINE_* macro declares a FLAGS_<name> variable that the rest of
 // this file reads.

 // Input selection.
 DEFINE_string(skps, "skps", "Directory to read skps from.");
 DEFINE_string2(match, m, nullptr,
                "[~][^]substring[$] [...] of GM name to run.\n"
                "Multiple matches may be separated by spaces.\n"
                "~ causes a matching bench to always be skipped\n"
                "^ requires the start of the bench to match\n"
                "$ requires the end of the bench to match\n"
                "^ and $ requires an exact match\n"
                "If a bench does not match any list entry,\n"
                "it is skipped unless some list entry starts with ~");

 // Timing / loop tuning.
 DEFINE_int32(gpuFrameLag, 5, "If unknown, estimated maximum number of frames GPU allows to lag.");
 DEFINE_int32(samples, 10, "Number of samples to measure for each bench.");
 DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
 DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
 DEFINE_double(gpuMs, 5, "Target bench time in milliseconds for GPU.");

 // Output and execution mode.
 DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
 DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
 DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");

 // Formats a duration given in milliseconds by delegating to HumanizeMs().  HUMANIZE() exposes
 // the result as a C string that lives until the end of the enclosing full expression.
 static SkString humanize(double ms) {
     SkString text = HumanizeMs(ms);
     return text;
 }
 #define HUMANIZE(ms) humanize(ms).c_str()

 namespace kilobench {
 class BenchmarkStream {
 public:
     BenchmarkStream() : fCurrentSKP(0) {
         for (int i = 0; i < FLAGS_skps.count(); i++) {
             if (SkStrEndsWith(FLAGS_skps[i], ".skp")) {
                 fSKPs.push_back() = FLAGS_skps[i];
             } else {
                 SkOSFile::Iter it(FLAGS_skps[i], ".skp");
                 SkString path;
                 while (it.next(&path)) {
                     fSKPs.push_back() = SkOSPath::Join(FLAGS_skps[0], path.c_str());
                 }
             }
         }
     }

     Benchmark* next() {
         Benchmark* bench = nullptr;
         // skips non matching benches
         while ((bench = this->innerNext()) &&
                (SkCommandLineFlags::ShouldSkip(FLAGS_match, bench->getUniqueName()) ||
                 !bench->isSuitableFor(Benchmark::kGPU_Backend))) {
             delete bench;
         }
         return bench;
     }

 private:
     static bool ReadPicture(const char* path, SkAutoTUnref<SkPicture>* pic) {
         // Not strictly necessary, as it will be checked again later,
         // but helps to avoid a lot of pointless work if we're going to skip it.
         if (SkCommandLineFlags::ShouldSkip(FLAGS_match, path)) {
             return false;
         }

         SkAutoTDelete<SkStream> stream(SkStream::NewFromFile(path));
         if (stream.get() == nullptr) {
             SkDebugf("Could not read %s.\n", path);
             return false;
         }

         pic->reset(SkPicture::CreateFromStream(stream.get()));
         if (pic->get() == nullptr) {
             SkDebugf("Could not read %s as an SkPicture.\n", path);
             return false;
         }
         return true;
     }

     Benchmark* innerNext() {
         // Render skps
         while (fCurrentSKP < fSKPs.count()) {
             const SkString& path = fSKPs[fCurrentSKP++];
             SkAutoTUnref<SkPicture> pic;
             if (!ReadPicture(path.c_str(), &pic)) {
                 continue;
             }

             SkString name = SkOSPath::Basename(path.c_str());
             return new VisualSKPBench(name.c_str(), pic.get());
         }

         return nullptr;
     }

     SkTArray<SkString> fSKPs;
     int fCurrentSKP;
 };

 struct GPUTarget {
     void setup() {
         fGL->makeCurrent();
         // Make sure we're done with whatever came before.
         SK_GL(*fGL, Finish());
     }

     SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }

     void endTiming(bool usePlatformSwapBuffers) {
         if (fGL) {
             SK_GL(*fGL, Flush());
             if (usePlatformSwapBuffers) {
                 fGL->swapBuffers();
             } else {
                 fGL->waitOnSyncOrSwap();
             }
         }
     }
     void finish() {
         SK_GL(*fGL, Finish());
     }

     bool needsFrameTiming(int* maxFrameLag) const {
         if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
             // Frame lag is unknown.
             *maxFrameLag = FLAGS_gpuFrameLag;
         }
         return true;
     }

     bool init(Benchmark* bench, GrContextFactory* factory, bool useDfText,
               GrContextFactory::GLContextType ctxType,
               GrContextFactory::GLContextOptions ctxOptions, int numSamples) {
         GrContext* context = factory->get(ctxType, ctxOptions);
         int maxRTSize = context->caps()->maxRenderTargetSize();
         SkImageInfo info = SkImageInfo::Make(SkTMin(bench->getSize().fX, maxRTSize),
                                              SkTMin(bench->getSize().fY, maxRTSize),
                                               kN32_SkColorType, kPremul_SkAlphaType);
         uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag :
                                                   0;
         SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
         fSurface.reset(SkSurface::NewRenderTarget(context,
                                                   SkBudgeted::kNo, info,
                                                   numSamples, &props));
         fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
         if (!fSurface.get()) {
             return false;
         }

         // Kilobench should only be used on platforms with fence sync support
         SkASSERT(fGL->fenceSyncSupport());
         return true;
     }

     SkCanvas* getCanvas() const {
         if (!fSurface.get()) {
             return nullptr;
         }
         return fSurface->getCanvas();
     }

     bool capturePixels(SkBitmap* bmp) {
         SkCanvas* canvas = this->getCanvas();
         if (!canvas) {
             return false;
         }
         bmp->setInfo(canvas->imageInfo());
         if (!canvas->readPixels(bmp, 0, 0)) {
             SkDebugf("Can't read canvas pixels.\n");
             return false;
         }
         return true;
     }

     SkGLContext* gl() { return fGL; }

 private:
     SkGLContext* fGL;
     SkAutoTDelete<SkSurface> fSurface;
 };

 // Reads back the target's canvas and writes it to 'filename' as a PNG, creating the parent
 // directory first.  Returns false on any failure (empty filename, unknown color type,
 // readback, mkdir, file open or encode).
 static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
     if (filename.isEmpty()) {
         return false;
     }

     SkCanvas* canvas = target->getCanvas();
     if (canvas && kUnknown_SkColorType == canvas->imageInfo().colorType()) {
         return false;
     }

     SkBitmap pixels;
     if (!target->capturePixels(&pixels)) {
         return false;
     }

     const SkString parentDir = SkOSPath::Dirname(filename.c_str());
     if (!sk_mkdir(parentDir.c_str())) {
         SkDebugf("Can't make dir %s.\n", parentDir.c_str());
         return false;
     }

     SkFILEWStream out(filename.c_str());
     if (!out.isValid()) {
         SkDebugf("Can't write %s.\n", filename.c_str());
         return false;
     }

     const bool encoded = SkImageEncoder::EncodeStream(&out, pixels,
                                                       SkImageEncoder::kPNG_Type, 100);
     if (!encoded) {
         SkDebugf("Can't encode a PNG.\n");
         return false;
     }
     return true;
 }

 // Maps the magic "run forever" request (any negative loop count) to SK_MaxS32; every other
 // value is returned unchanged.
 static int detect_forever_loops(int loops) {
     return (loops < 0) ? SK_MaxS32 : loops;
 }

 // Restricts 'loops' to the range [1, FLAGS_maxLoops], logging whenever the value is changed.
 static int clamp_loops(int loops) {
     const bool tooSmall = loops < 1;
     const bool tooLarge = !tooSmall && loops > FLAGS_maxLoops;

     if (tooSmall) {
         SkDebugf("ERROR: clamping loops from %d to 1. "
                  "There's probably something wrong with the bench.\n", loops);
         return 1;
     }
     if (tooLarge) {
         SkDebugf("WARNING: clamping loops from %d to FLAGS_maxLoops, %d.\n", loops, FLAGS_maxLoops);
         return FLAGS_maxLoops;
     }
     return loops;
 }

 // Current time in milliseconds, derived from SkTime::GetNSecs().
 static double now_ms() {
     return SkTime::GetNSecs() * 1e-6;  // nanoseconds -> milliseconds
 }

 /**
  * State shared between the main (drawing) thread and a background thread that measures frame
  * time by waiting on GPU fences.  The main thread calls pushStartSync(), pushEndSync() and
  * setDone(); each of those signals fSemaphore once.  The background thread runs timingLoop(),
  * which waits on fSemaphore three times per iteration.
  */
 struct TimingThread {
     // Captures the fence-sync object of 'mainContext' and remembers the context so that
     // timingLoop() can create a context sharing with it.
     TimingThread(SkGLContext* mainContext)
         : fFenceSync(mainContext->fenceSync())
         ,  fMainContext(mainContext)
         ,  fDone(false) {}

     // Thread entry point; 'data' must point to a TimingThread.
     static void Loop(void* data) {
         TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
         timingThread->timingLoop();
     }

     // To ensure waiting for the sync actually does something, we check that the wait took
     // longer than some small value (in now_ms() units, i.e. milliseconds).
     const double kMinElapsed = 1e-6;
     bool sanity(double start) const {
         double elapsed = now_ms() - start;
         return elapsed > kMinElapsed;
     }

     // Waits on 'sync'; in debug builds also asserts that the wait took a measurable time.
     void waitFence(SkPlatformGpuFence sync) {
         SkDEBUGCODE(double start = now_ms());
         fFenceSync->waitFence(sync, false);
         SkASSERT(sanity(start));
     }

     // Body of the background thread.  Appends one entry to fTimings per completed iteration.
     void timingLoop() {
         // Create a context which shares display lists with the main thread
         SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
                                                                        fMainContext));
         glContext->makeCurrent();

         // Basic timing methodology is:
         // 1) Wait on semaphore until main thread indicates its time to start timing the frame
         // 2) Wait on frame start sync, record time.  This is start of the frame.
         // 3) Wait on semaphore until main thread indicates its time to finish timing the frame
         // 4) Wait on frame end sync, record time.  FrameEndTime - FrameStartTime = frame time
         // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
         while (true) {
             fSemaphore.wait();

             // get start sync
             SkPlatformGpuFence startSync = this->popStartSync();

             // wait on sync
             this->waitFence(startSync);
             double start = kilobench::now_ms();

             // do we want to sleep here?
             // wait for end sync
             fSemaphore.wait();

             // get end sync
             SkPlatformGpuFence endSync = this->popEndSync();

             // wait on sync
             this->waitFence(endSync);
             double elapsed = kilobench::now_ms() - start;

             // No mutex needed, client won't touch timings until we're done
             fTimings.push_back(elapsed);

             // clean up fences
             fFenceSync->deleteFence(startSync);
             fFenceSync->deleteFence(endSync);

             // NOTE(review): the loop exits as soon as fDone is set and no start sync is
             // queued; if the main thread pushes further syncs afterwards they are never
             // consumed here -- confirm this is intended.
             fSemaphore.wait();
             if (this->isDone()) {
                 break;
             }
         }
     }

     // Main-thread side: insert a fence marking the start of a frame and wake the timing thread.
     void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }

     SkPlatformGpuFence popStartSync() {
         return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
     }

     // Main-thread side: insert a fence marking the end of a frame and wake the timing thread.
     void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }

     SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }

     // Sets the done flag under fDoneMutex and signals the semaphore once.
     void setDone() {
         SkAutoMutexAcquire done(fDoneMutex);
         fDone = true;
         fSemaphore.signal();
     }

     typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;

     // Inserts a new fence at the head of 'queue' (under 'mutex') and signals the semaphore.
     void pushSync(SyncQueue* queue, SkMutex* mutex) {
         SkAutoMutexAcquire am(mutex);
         *queue->addToHead() = fFenceSync->insertFence();
         fSemaphore.signal();
     }

     // Removes and returns the fence at the head of 'queue' (under 'mutex').
     // NOTE(review): push and pop both operate on the head, so with more than one fence queued
     // the most recently pushed one is returned first -- confirm this ordering is intended.
     SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
         SkAutoMutexAcquire am(mutex);
         SkPlatformGpuFence sync = *queue->head();
         queue->popHead();
         return sync;
     }

     // True once setDone() has been called and no frame-start fence is waiting to be consumed.
     bool isDone() {
         SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
         SkAutoMutexAcquire done(fDoneMutex);
         if (fDone && fFrameStartSyncs.isEmpty()) {
             return true;
         } else {
             return false;
         }
     }

     // Frame times recorded by timingLoop(), in now_ms() units.  Asserts (debug only) that
     // setDone() has been called; fDone is read here without taking fDoneMutex.
     const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }

 private:
     SkGpuFenceSync* fFenceSync;         // fence-sync object taken from the main context
     SkSemaphore fSemaphore;             // signalled by pushSync() and setDone()
     SkMutex fFrameStartSyncsMutex;      // guards fFrameStartSyncs
     SyncQueue fFrameStartSyncs;
     SkMutex fFrameEndSyncsMutex;        // guards fFrameEndSyncs
     SyncQueue fFrameEndSyncs;
     SkTArray<double> fTimings;          // written by the timing thread only
     SkMutex fDoneMutex;                 // guards fDone in setDone()/isDone()
     SkGLContext* fMainContext;
     bool fDone;
 };

 // Draws one frame of 'bench' ('loops' iterations) into the target and returns the wall-clock
 // milliseconds spent between the start of drawing and the end of endTiming().  When a timing
 // thread is supplied, start/end fences are pushed around the frame, setDone() is called, and
 // the platform swap-buffers path is used in endTiming().
 static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
     const bool haveTimingThread = (nullptr != timingThread);

     SkCanvas* canvas = target->getCanvas();
     canvas->clear(SK_ColorWHITE);
     bench->preDraw(canvas);

     if (haveTimingThread) {
         timingThread->pushStartSync();
     }

     const double begin = now_ms();
     canvas = target->beginTiming(canvas);
     bench->draw(loops, canvas);
     canvas->flush();
     target->endTiming(haveTimingThread);
     const double cpuMs = now_ms() - begin;

     if (haveTimingThread) {
         timingThread->pushEndSync();
         timingThread->setDone();
     }

     bench->postDraw(canvas);
     return cpuMs;
 }

 // Determines how many loops each timed frame should run and warms up the GPU pipeline.
 //
 // If the bench asks for auto-tuning (calculateLoops() returns kAutoTuneLoops), the loop count
 // is doubled until a frame takes at least FLAGS_gpuMs, then scaled back linearly and clamped
 // with clamp_loops().  Otherwise the bench's own loop count is used (negative means forever).
 // In both cases maxGpuFrameLag - 1 warm-up frames are drawn before returning.
 //
 // TODO For now we don't use the background timing thread to tune loops
 static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
     // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
     int loops = bench->calculateLoops(FLAGS_loops);
     if (kAutoTuneLoops == loops) {
         loops = 1;
         double elapsed = 0;
         do {
             if (1<<30 == loops) {
                 // We're about to wrap.  Something's wrong with the bench.
                 loops = 0;
                 break;
             }
             loops *= 2;
             // If the GPU lets frames lag at all, we need to make sure we're timing
             // _this_ round, not still timing last round.
             for (int i = 0; i < maxGpuFrameLag; i++) {
                 elapsed = time(loops, bench, target, nullptr);
             }
         } while (elapsed < FLAGS_gpuMs);

         // We've overshot at least a little.  Scale back linearly.  Skip the scaling when no
         // time was measured (e.g. maxGpuFrameLag == 0): dividing by zero would yield NaN/inf,
         // and converting that to int is undefined behaviour.
         if (elapsed > 0) {
             loops = (int)ceil(loops * FLAGS_gpuMs / elapsed);
         }
         loops = clamp_loops(loops);

         // Make sure we're not still timing our calibration.
         target->finish();
     } else {
         loops = detect_forever_loops(loops);
     }

     // Pretty much the same deal as the calibration: do some warmup to make
     // sure we're timing steady-state pipelined frames.
     for (int i = 0; i < maxGpuFrameLag - 1; i++) {
         time(loops, bench, target, nullptr);
     }

     return loops;
 }

 /**
  * Bundles everything needed to run one benchmark on the GPU: a fresh GrContextFactory, a
  * GPUTarget initialised for the native GL context, the target's canvas and the maximum GPU
  * frame lag.  The constructor performs all setup, including perCanvasPreDraw(); callers invoke
  * teardownBench() themselves when finished.
  */
 struct AutoSetupContextBenchAndTarget {
     AutoSetupContextBenchAndTarget(Benchmark* bench) : fBenchmark(bench) {
         GrContextOptions options;
         fCtxFactory.reset(new GrContextFactory(options));

         SkAssertResult(fTarget.init(bench, fCtxFactory, false,
                                     GrContextFactory::kNative_GLContextType,
                                     GrContextFactory::kNone_GLContextOptions, 0));

         fCanvas = fTarget.getCanvas();
         fTarget.setup();

         bench->perCanvasPreDraw(fCanvas);
         fTarget.needsFrameTiming(&fMaxFrameLag);
     }

     // Tunes (or looks up) the loop count and runs the warm-up frames.
     int getLoops() {
         return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag);
     }

     // Draws fMaxFrameLag frames followed by one more frame, and returns that last frame's
     // time divided by 'loops'.
     double timeSample(int loops, TimingThread* timingThread) {
         for (int frame = 0; frame < fMaxFrameLag; ++frame) {
             time(loops, fBenchmark, &fTarget, timingThread);
         }

         const double lastFrameMs = time(loops, fBenchmark, &fTarget, timingThread);
         return lastFrameMs / loops;
     }

     void teardownBench() {
         fBenchmark->perCanvasPostDraw(fCanvas);
     }

     SkAutoTDelete<GrContextFactory> fCtxFactory;
     GPUTarget fTarget;
     SkCanvas* fCanvas;
     Benchmark* fBenchmark;
     int fMaxFrameLag;
 };

 // Sets up a context and target for 'bench', determines the loop count to use for timing and,
 // if --writePath is set, saves the rendered canvas to <writePath>/gpu/<bench name>.png.
 int setup_loops(Benchmark* bench) {
     AutoSetupContextBenchAndTarget ascbt(bench);
     const int loops = ascbt.getLoops();
     ascbt.teardownBench();

     const bool wantPng = !FLAGS_writePath.isEmpty() && FLAGS_writePath[0];
     if (wantPng) {
         SkString png = SkOSPath::Join(FLAGS_writePath[0], "gpu");
         png = SkOSPath::Join(png.c_str(), bench->getUniqueName());
         png.append(".png");
         write_canvas_png(&ascbt.fTarget, png);
     }
     return loops;
 }

 // One timing sample as produced by time_sample().
 struct Sample {
     double fCpu;  // value returned by AutoSetupContextBenchAndTarget::timeSample()
     double fGpu;  // smallest frame time recorded by the background TimingThread
 };

 // Takes one timing sample of 'bench' using a freshly created context and target.
 //
 // fCpu is the value returned by AutoSetupContextBenchAndTarget::timeSample().  With
 // --useBackgroundThread, fGpu is the minimum frame time recorded by a TimingThread running on
 // a separate SkThread; otherwise fGpu is 0.
 Sample time_sample(Benchmark* bench, int loops) {
     AutoSetupContextBenchAndTarget ascbt(bench);

     // Zero-initialise: fGpu is not measured without the background thread, but callers still
     // print and store it.
     Sample sample = { 0, 0 };
     if (FLAGS_useBackgroundThread) {
         TimingThread timingThread(ascbt.fTarget.gl());
         SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
         nativeThread->start();
         sample.fCpu = ascbt.timeSample(loops, &timingThread);
         nativeThread->join();

         // return the min
         double min = SK_ScalarMax;
         for (int i = 0; i < timingThread.timings().count(); i++) {
             min = SkTMin(min, timingThread.timings()[i]);
         }
         sample.fGpu = min;
     } else {
         sample.fCpu = ascbt.timeSample(loops, nullptr);
     }

     ascbt.teardownBench();

     return sample;
 }

 } // namespace kilobench

 // Size in bytes of the buffers kilobench_main() uses to pass results through the pipe.
 static const int kOutResultSize = 1024;

 void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
     SkString newName(name);
     newName.appendf("_%s", mod);
     Stats stats(samples);
     const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
     SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
         , loops
         , HUMANIZE(stats.min)
         , HUMANIZE(stats.median)
         , HUMANIZE(stats.mean)
         , HUMANIZE(stats.max)
         , stddev_percent
         , stats.plot.c_str()
         , "gpu"
         , newName.c_str()
     );
 }

 // Runs every matching benchmark and prints a "cpu" row (and, with --useBackgroundThread, a
 // "gpu" row) per bench.
 //
 // For each bench, FLAGS_samples + 1 runs are performed: run 0 determines the loop count, the
 // remaining runs each produce one sample.  With --useMultiProcess every run happens in a
 // forked child, which reports its result to the parent as text through a pipe.
 int kilobench_main() {
     kilobench::BenchmarkStream benchStream;

     SkDebugf("loops\tmin\tmedian\tmean\tmax\tstddev\t%-*s\tconfig\tbench\n",
              FLAGS_samples, "samples");

     int descriptors[2];
     if (pipe(descriptors) != 0) {
         SkFAIL("Failed to open a pipe\n");
     }

     while (Benchmark* b = benchStream.next()) {
         SkAutoTDelete<Benchmark> bench(b);

         int loops = 1;
         SkTArray<double> cpuSamples;
         SkTArray<double> gpuSamples;
         for (int i = 0; i < FLAGS_samples + 1; i++) {
             // We fork off a new process to setup the grcontext and run the test while we wait
             if (FLAGS_useMultiProcess) {
                 int childPid = fork();
                 if (childPid > 0) {
                     // Zero-filled so that a short or empty read still leaves a terminated
                     // string for sscanf instead of uninitialized stack contents.
                     char result[kOutResultSize] = { 0 };
                     if (read(descriptors[0], result, kOutResultSize) < 0) {
                          SkFAIL("Failed to read from pipe\n");
                     }
                     result[kOutResultSize - 1] = '\0';

                     // if samples == 0 then parse # of loops
                     // else parse float
                     if (i == 0) {
                         if (sscanf(result, "%d", &loops) != 1) {
                             SkDebugf("Could not parse loop count from child process.\n");
                         }
                     } else {
                         // Parse into locals so a failed parse records zeros, not garbage.
                         double cpu = 0;
                         double gpu = 0;
                         if (sscanf(result, "%lf %lf", &cpu, &gpu) != 2) {
                             SkDebugf("Could not parse sample from child process.\n");
                         }
                         cpuSamples.push_back(cpu);
                         gpuSamples.push_back(gpu);
                     }

                     // wait until exit
                     int status;
                     waitpid(childPid, &status, 0);
                 } else if (0 == childPid) {
                     char result[kOutResultSize];
                     if (i == 0) {
                         snprintf(result, sizeof(result), "%d", kilobench::setup_loops(bench));
                     } else {
                         kilobench::Sample sample = kilobench::time_sample(bench, loops);
                         snprintf(result, sizeof(result), "%lf %lf", sample.fCpu, sample.fGpu);
                     }

                     // Make sure to write the null terminator
                     if (write(descriptors[1], result, strlen(result) + 1) < 0) {
                         SkFAIL("Failed to write to pipe\n");
                     }
                     // The child's work is done; returning ends the child process via main().
                     return 0;
                 } else {
                     SkFAIL("Fork failed\n");
                 }
             } else {
                 if (i == 0) {
                     loops = kilobench::setup_loops(bench);
                 } else {
                     kilobench::Sample sample = kilobench::time_sample(bench, loops);
                     cpuSamples.push_back(sample.fCpu);
                     gpuSamples.push_back(sample.fGpu);
                 }
             }
         }

         printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
         if (FLAGS_useBackgroundThread) {
             printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
         }
     }

     // Release both ends of the pipe once all benchmarks have run.
     close(descriptors[0]);
     close(descriptors[1]);
     return 0;
 }

 #if !defined SK_BUILD_FOR_IOS
 // Program entry point: parse the command line flags, then run the benchmarks.
 int main(int argc, char** argv) {
     SkCommandLineFlags::Parse(argc, argv);
     const int exitCode = kilobench_main();
     return exitCode;
 }
 #endif
	/*
	* Copyright 2016 Google Inc.
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#include "GrCaps.h"
	#include "GrContextFactory.h"
	#include "Benchmark.h"
	#include "ResultsWriter.h"
	#include "SkCommandLineFlags.h"
	#include "SkOSFile.h"
	#include "SkStream.h"
	#include "SkSurface.h"
	#include "SkTime.h"
	#include "SkTLList.h"
	#include "SkThreadUtils.h"
	#include "Stats.h"
	#include "Timer.h"
	#include "VisualSKPBench.h"
	#include "gl/GrGLDefines.h"
	#include "../private/SkMutex.h"
	#include "../private/SkSemaphore.h"
	#include "../private/SkGpuFenceSync.h"

	// posix only for now
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/wait.h>

	/*
	* This is an experimental GPU only benchmarking program. The initial implementation will only
	* support SKPs.
	*/

	// To get image decoders linked in we have to do the below magic
	#include "SkForceLinking.h"
	#include "SkImageDecoder.h"
	__SK_FORCE_IMAGE_DECODER_LINKING;

	// Sentinel loop count: when a bench's calculateLoops(FLAGS_loops) yields this value the
	// loop count is auto-tuned.
	static const int kAutoTuneLoops = 0;

	// Default for --loops: a single loop in SK_DEBUG builds, auto-tuning otherwise.
	static const int kDefaultLoops =
	#ifdef SK_DEBUG
	1;
	#else
	kAutoTuneLoops;
	#endif

	// Builds the help text for --loops, embedding the auto-tune sentinel value in the message.
	static SkString loops_help_txt() {
	    SkString text;
	    text.printf("Number of times to run each bench. Set this to %d to auto-"
	                "tune for each bench. Timings are only reported when auto-tuning.",
	                kAutoTuneLoops);
	    return text;
	}

	// Command line flags.  Each DEFINE_* macro declares a FLAGS_<name> variable that the rest of
	// this file reads.

	// Input selection.
	DEFINE_string(skps, "skps", "Directory to read skps from.");
	DEFINE_string2(match, m, nullptr,
	               "[~][^]substring[$] [...] of GM name to run.\n"
	               "Multiple matches may be separated by spaces.\n"
	               "~ causes a matching bench to always be skipped\n"
	               "^ requires the start of the bench to match\n"
	               "$ requires the end of the bench to match\n"
	               "^ and $ requires an exact match\n"
	               "If a bench does not match any list entry,\n"
	               "it is skipped unless some list entry starts with ~");

	// Timing / loop tuning.
	DEFINE_int32(gpuFrameLag, 5, "If unknown, estimated maximum number of frames GPU allows to lag.");
	DEFINE_int32(samples, 10, "Number of samples to measure for each bench.");
	DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
	DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
	DEFINE_double(gpuMs, 5, "Target bench time in milliseconds for GPU.");

	// Output and execution mode.
	DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
	DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
	DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");

	// Formats a duration given in milliseconds by delegating to HumanizeMs().  HUMANIZE() exposes
	// the result as a C string that lives until the end of the enclosing full expression.
	static SkString humanize(double ms) {
	    SkString text = HumanizeMs(ms);
	    return text;
	}
	#define HUMANIZE(ms) humanize(ms).c_str()

	namespace kilobench {
	class BenchmarkStream {
	public:
	BenchmarkStream() : fCurrentSKP(0) {
	for (int i = 0; i < FLAGS_skps.count(); i++) {
	if (SkStrEndsWith(FLAGS_skps[i], ".skp")) {
	fSKPs.push_back() = FLAGS_skps[i];
	} else {
	SkOSFile::Iter it(FLAGS_skps[i], ".skp");
	SkString path;
	while (it.next(&path)) {
	fSKPs.push_back() = SkOSPath::Join(FLAGS_skps[0], path.c_str());
	}
	}
	}
	}

	Benchmark* next() {
	Benchmark* bench = nullptr;
	// skips non matching benches
	while ((bench = this->innerNext()) &&
	(SkCommandLineFlags::ShouldSkip(FLAGS_match, bench->getUniqueName()) \|\|
	!bench->isSuitableFor(Benchmark::kGPU_Backend))) {
	delete bench;
	}
	return bench;
	}

	private:
	static bool ReadPicture(const char* path, SkAutoTUnref<SkPicture>* pic) {
	// Not strictly necessary, as it will be checked again later,
	// but helps to avoid a lot of pointless work if we're going to skip it.
	if (SkCommandLineFlags::ShouldSkip(FLAGS_match, path)) {
	return false;
	}

	SkAutoTDelete<SkStream> stream(SkStream::NewFromFile(path));
	if (stream.get() == nullptr) {
	SkDebugf("Could not read %s.\n", path);
	return false;
	}

	pic->reset(SkPicture::CreateFromStream(stream.get()));
	if (pic->get() == nullptr) {
	SkDebugf("Could not read %s as an SkPicture.\n", path);
	return false;
	}
	return true;
	}

	Benchmark* innerNext() {
	// Render skps
	while (fCurrentSKP < fSKPs.count()) {
	const SkString& path = fSKPs[fCurrentSKP++];
	SkAutoTUnref<SkPicture> pic;
	if (!ReadPicture(path.c_str(), &pic)) {
	continue;
	}

	SkString name = SkOSPath::Basename(path.c_str());
	return new VisualSKPBench(name.c_str(), pic.get());
	}

	return nullptr;
	}

	SkTArray<SkString> fSKPs;
	int fCurrentSKP;
	};

	struct GPUTarget {
	void setup() {
	fGL->makeCurrent();
	// Make sure we're done with whatever came before.
	SK_GL(*fGL, Finish());
	}

	SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }

	void endTiming(bool usePlatformSwapBuffers) {
	if (fGL) {
	SK_GL(*fGL, Flush());
	if (usePlatformSwapBuffers) {
	fGL->swapBuffers();
	} else {
	fGL->waitOnSyncOrSwap();
	}
	}
	}
	void finish() {
	SK_GL(*fGL, Finish());
	}

	bool needsFrameTiming(int* maxFrameLag) const {
	if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
	// Frame lag is unknown.
	*maxFrameLag = FLAGS_gpuFrameLag;
	}
	return true;
	}

	bool init(Benchmark* bench, GrContextFactory* factory, bool useDfText,
	GrContextFactory::GLContextType ctxType,
	GrContextFactory::GLContextOptions ctxOptions, int numSamples) {
	GrContext* context = factory->get(ctxType, ctxOptions);
	int maxRTSize = context->caps()->maxRenderTargetSize();
	SkImageInfo info = SkImageInfo::Make(SkTMin(bench->getSize().fX, maxRTSize),
	SkTMin(bench->getSize().fY, maxRTSize),
	kN32_SkColorType, kPremul_SkAlphaType);
	uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag :
	0;
	SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
	fSurface.reset(SkSurface::NewRenderTarget(context,
	SkBudgeted::kNo, info,
	numSamples, &props));
	fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
	if (!fSurface.get()) {
	return false;
	}

	// Kilobench should only be used on platforms with fence sync support
	SkASSERT(fGL->fenceSyncSupport());
	return true;
	}

	SkCanvas* getCanvas() const {
	if (!fSurface.get()) {
	return nullptr;
	}
	return fSurface->getCanvas();
	}

	bool capturePixels(SkBitmap* bmp) {
	SkCanvas* canvas = this->getCanvas();
	if (!canvas) {
	return false;
	}
	bmp->setInfo(canvas->imageInfo());
	if (!canvas->readPixels(bmp, 0, 0)) {
	SkDebugf("Can't read canvas pixels.\n");
	return false;
	}
	return true;
	}

	SkGLContext* gl() { return fGL; }

	private:
	SkGLContext* fGL;
	SkAutoTDelete<SkSurface> fSurface;
	};

	// Reads back the target's canvas and writes it to 'filename' as a PNG, creating the parent
	// directory first.  Returns false on any failure (empty filename, unknown color type,
	// readback, mkdir, file open or encode).
	static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
	    if (filename.isEmpty()) {
	        return false;
	    }

	    SkCanvas* canvas = target->getCanvas();
	    if (canvas && kUnknown_SkColorType == canvas->imageInfo().colorType()) {
	        return false;
	    }

	    SkBitmap pixels;
	    if (!target->capturePixels(&pixels)) {
	        return false;
	    }

	    const SkString parentDir = SkOSPath::Dirname(filename.c_str());
	    if (!sk_mkdir(parentDir.c_str())) {
	        SkDebugf("Can't make dir %s.\n", parentDir.c_str());
	        return false;
	    }

	    SkFILEWStream out(filename.c_str());
	    if (!out.isValid()) {
	        SkDebugf("Can't write %s.\n", filename.c_str());
	        return false;
	    }

	    const bool encoded = SkImageEncoder::EncodeStream(&out, pixels,
	                                                      SkImageEncoder::kPNG_Type, 100);
	    if (!encoded) {
	        SkDebugf("Can't encode a PNG.\n");
	        return false;
	    }
	    return true;
	}

	// Maps the magic "run forever" request (any negative loop count) to SK_MaxS32; every other
	// value is returned unchanged.
	static int detect_forever_loops(int loops) {
	    return (loops < 0) ? SK_MaxS32 : loops;
	}

	// Restricts 'loops' to the range [1, FLAGS_maxLoops], logging whenever the value is changed.
	static int clamp_loops(int loops) {
	    const bool tooSmall = loops < 1;
	    const bool tooLarge = !tooSmall && loops > FLAGS_maxLoops;

	    if (tooSmall) {
	        SkDebugf("ERROR: clamping loops from %d to 1. "
	                 "There's probably something wrong with the bench.\n", loops);
	        return 1;
	    }
	    if (tooLarge) {
	        SkDebugf("WARNING: clamping loops from %d to FLAGS_maxLoops, %d.\n", loops, FLAGS_maxLoops);
	        return FLAGS_maxLoops;
	    }
	    return loops;
	}

	// Current time in milliseconds, derived from SkTime::GetNSecs().
	static double now_ms() {
	    return SkTime::GetNSecs() * 1e-6;  // nanoseconds -> milliseconds
	}

	struct TimingThread {
	TimingThread(SkGLContext* mainContext)
	: fFenceSync(mainContext->fenceSync())
	, fMainContext(mainContext)
	, fDone(false) {}

	static void Loop(void* data) {
	TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
	timingThread->timingLoop();
	}

	// To ensure waiting for the sync actually does something, we check to make sure the we exceed
	// some small value
	const double kMinElapsed = 1e-6;
	bool sanity(double start) const {
	double elapsed = now_ms() - start;
	return elapsed > kMinElapsed;
	}

	void waitFence(SkPlatformGpuFence sync) {
	SkDEBUGCODE(double start = now_ms());
	fFenceSync->waitFence(sync, false);
	SkASSERT(sanity(start));
	}

	void timingLoop() {
	// Create a context which shares display lists with the main thread
	SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
	fMainContext));
	glContext->makeCurrent();

	// Basic timing methodology is:
	// 1) Wait on semaphore until main thread indicates its time to start timing the frame
	// 2) Wait on frame start sync, record time. This is start of the frame.
	// 3) Wait on semaphore until main thread indicates its time to finish timing the frame
	// 4) Wait on frame end sync, record time. FrameEndTime - FrameStartTime = frame time
	// 5) Wait on semaphore until main thread indicates we should time the next frame or quit
	while (true) {
	fSemaphore.wait();

	// get start sync
	SkPlatformGpuFence startSync = this->popStartSync();

	// wait on sync
	this->waitFence(startSync);
	double start = kilobench::now_ms();

	// do we want to sleep here?
	// wait for end sync
	fSemaphore.wait();

	// get end sync
	SkPlatformGpuFence endSync = this->popEndSync();

	// wait on sync
	this->waitFence(endSync);
	double elapsed = kilobench::now_ms() - start;

	// No mutex needed, client won't touch timings until we're done
	fTimings.push_back(elapsed);

	// clean up fences
	fFenceSync->deleteFence(startSync);
	fFenceSync->deleteFence(endSync);

	fSemaphore.wait();
	if (this->isDone()) {
	break;
	}
	}
	}

	void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }

	// Removes and returns the fence at the head of the frame-start queue (see popSync()).
	SkPlatformGpuFence popStartSync() {
	    SkPlatformGpuFence startFence = this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
	    return startFence;
	}

	// Queues a fence marking the end of a frame (see pushSync()).
	void pushEndSync() {
	    this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex);
	}

	// Removes and returns the fence at the head of the frame-end queue (see popSync()).
	SkPlatformGpuFence popEndSync() {
	    SkPlatformGpuFence endFence = this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex);
	    return endFence;
	}

	// Sets the done flag and signals the semaphore so that a timing loop blocked in
	// fSemaphore.wait() wakes up and can check isDone(). Both happen while fDoneMutex is held.
	void setDone() {
	    SkAutoMutexAcquire doneLock(fDoneMutex);
	    fDone = true;
	    fSemaphore.signal();
	}

	// List type used to hold the pending frame-start / frame-end fences.
	using SyncQueue = SkTLList<SkPlatformGpuFence, 1>;

	// Inserts a new GPU fence, stores it in 'queue' while holding 'mutex', and signals the
	// semaphore so the timing loop can pick it up.
	//
	// The fence is appended at the tail because popSync() removes from the head: this keeps the
	// queue first-in-first-out, so when several fences are pending the timing loop waits on them
	// in the order they were inserted rather than newest-first.
	void pushSync(SyncQueue* queue, SkMutex* mutex) {
	    SkAutoMutexAcquire am(mutex);
	    *queue->addToTail() = fFenceSync->insertFence();
	    fSemaphore.signal();
	}

	// Removes the fence at the head of 'queue' while holding 'mutex' and returns it.
	// The head is dereferenced without a check, so the queue must not be empty.
	SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
	    SkAutoMutexAcquire am(mutex);
	    SkPlatformGpuFence front = *queue->head();
	    queue->popHead();
	    return front;
	}

	bool isDone() {
	SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
	SkAutoMutexAcquire done(fDoneMutex);
	if (fDone && fFrameStartSyncs.isEmpty()) {
	return true;
	} else {
	return false;
	}
	}

	// Frame times recorded by timingLoop(). Asserts (debug only) that setDone() has been called;
	// fDone is read here without taking fDoneMutex.
	const SkTArray<double>& timings() const {
	    SkASSERT(fDone);
	    return fTimings;
	}

	private:
	// Used to insert (pushSync), wait on (waitFence) and delete (timingLoop) GPU fences.
	SkGpuFenceSync* fFenceSync;
	// Signaled by pushSync() and setDone(); waited on by timingLoop().
	SkSemaphore fSemaphore;
	// Guards fFrameStartSyncs.
	SkMutex fFrameStartSyncsMutex;
	// Pending frame-start fences.
	SyncQueue fFrameStartSyncs;
	// Guards fFrameEndSyncs.
	SkMutex fFrameEndSyncsMutex;
	// Pending frame-end fences.
	SyncQueue fFrameEndSyncs;
	// One entry per timed frame: the now_ms() difference between the end of the start-fence wait
	// and the end of the end-fence wait. Written by timingLoop() without a lock.
	SkTArray<double> fTimings;
	// Guards fDone in setDone() and isDone().
	SkMutex fDoneMutex;
	// Passed to SkCreatePlatformGLContext() in timingLoop() when the timing thread creates its
	// own GL context.
	SkGLContext* fMainContext;
	// Set to true by setDone().
	bool fDone;
	};

	// Draws one frame of 'bench' ('loops' iterations) into 'target' and returns the time, as
	// measured with now_ms() on the calling thread, from just before beginTiming() to just after
	// endTiming(). When 'timingThread' is non-null, a start sync is pushed before the frame and
	// an end sync after it, followed by setDone().
	static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
	    const bool haveTimingThread = (nullptr != timingThread);

	    SkCanvas* canvas = target->getCanvas();
	    canvas->clear(SK_ColorWHITE);
	    bench->preDraw(canvas);

	    if (haveTimingThread) {
	        timingThread->pushStartSync();
	    }

	    const double frameStart = now_ms();
	    canvas = target->beginTiming(canvas);
	    bench->draw(loops, canvas);
	    canvas->flush();
	    target->endTiming(haveTimingThread);
	    const double elapsed = now_ms() - frameStart;

	    if (haveTimingThread) {
	        timingThread->pushEndSync();
	        // NOTE(review): setDone() runs after every frame, not only the last one, and
	        // isDone() additionally requires an empty start-sync queue -- confirm the timing
	        // thread cannot exit while further frames are still to be pushed.
	        timingThread->setDone();
	    }
	    bench->postDraw(canvas);
	    return elapsed;
	}

	// TODO For now we don't use the background timing thread to tune loops
	//
	// Determines how many loops of 'bench' to run per frame and then draws warm-up frames.
	//   - If bench->calculateLoops(FLAGS_loops) returns kAutoTuneLoops, the loop count is doubled
	//     until one frame takes at least FLAGS_gpuMs, then scaled back linearly and passed
	//     through clamp_loops().
	//   - Otherwise the requested count is passed through detect_forever_loops().
	// 'maxGpuFrameLag' is the number of frames drawn per calibration round; one fewer than that
	// is drawn as warm-up before returning. Returns the chosen loop count.
	static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
	    // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
	    int loops = bench->calculateLoops(FLAGS_loops);
	    if (kAutoTuneLoops == loops) {
	        loops = 1;
	        double elapsed = 0;
	        do {
	            if (1<<30 == loops) {
	                // We're about to wrap. Something's wrong with the bench.
	                loops = 0;
	                break;
	            }
	            loops *= 2;
	            // If the GPU lets frames lag at all, we need to make sure we're timing
	            // _this_ round, not still timing last round.
	            for (int i = 0; i < maxGpuFrameLag; i++) {
	                elapsed = time(loops, bench, target, nullptr);
	            }
	        } while (elapsed < FLAGS_gpuMs);

	        // We've overshot at least a little. Scale back linearly.
	        // Skip the scaling when no time was measured: dividing by zero would produce
	        // NaN/inf, and converting that to int is undefined. The multiplication is done in
	        // double so that a large loop count cannot overflow an integer product.
	        if (elapsed > 0) {
	            loops = static_cast<int>(ceil(static_cast<double>(loops) * FLAGS_gpuMs / elapsed));
	        }
	        loops = clamp_loops(loops);

	        // Make sure we're not still timing our calibration.
	        target->finish();
	    } else {
	        loops = detect_forever_loops(loops);
	    }

	    // Pretty much the same deal as the calibration: do some warmup to make
	    // sure we're timing steady-state pipelined frames.
	    for (int i = 0; i < maxGpuFrameLag - 1; i++) {
	        time(loops, bench, target, nullptr);
	    }

	    return loops;
	}

	// Bundles the context factory, GPU target and canvas needed to run one benchmark. The
	// constructor performs all setup; teardownBench() has to be called explicitly, as there is
	// no destructor that does it.
	struct AutoSetupContextBenchAndTarget {
	    AutoSetupContextBenchAndTarget(Benchmark* bench) : fBenchmark(bench) {
	        // Fresh context factory with default options.
	        GrContextOptions grContextOpts;
	        fCtxFactory.reset(new GrContextFactory(grContextOpts));

	        SkAssertResult(fTarget.init(bench,
	                                    fCtxFactory,
	                                    false,
	                                    GrContextFactory::kNative_GLContextType,
	                                    GrContextFactory::kNone_GLContextOptions,
	                                    0));

	        fCanvas = fTarget.getCanvas();
	        fTarget.setup();

	        fBenchmark->perCanvasPreDraw(fCanvas);
	        // The return value is not checked; fMaxFrameLag is whatever the target writes.
	        fTarget.needsFrameTiming(&fMaxFrameLag);
	    }

	    // Determines the loop count for this benchmark (see setup_gpu_bench()).
	    int getLoops() {
	        return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag);
	    }

	    // Draws fMaxFrameLag frames whose times are discarded, then one more frame, and returns
	    // that last frame's time divided by 'loops'. Every frame is drawn with 'timingThread'.
	    double timeSample(int loops, TimingThread* timingThread) {
	        for (int frame = 0; frame < fMaxFrameLag; frame++) {
	            time(loops, fBenchmark, &fTarget, timingThread);
	        }

	        const double lastFrame = time(loops, fBenchmark, &fTarget, timingThread);
	        return lastFrame / loops;
	    }

	    // Counterpart of the perCanvasPreDraw() call made in the constructor.
	    void teardownBench() {
	        fBenchmark->perCanvasPostDraw(fCanvas);
	    }

	    SkAutoTDelete<GrContextFactory> fCtxFactory;
	    GPUTarget fTarget;
	    SkCanvas* fCanvas;
	    Benchmark* fBenchmark;
	    int fMaxFrameLag;
	};

	// Sets up a context for 'bench', determines its loop count and tears the bench down again.
	// If FLAGS_writePath has a non-null first entry, the target's canvas is also written to
	// <writePath>/gpu/<unique bench name>.png. Returns the loop count.
	int setup_loops(Benchmark* bench) {
	    AutoSetupContextBenchAndTarget ascbt(bench);
	    const int loops = ascbt.getLoops();
	    ascbt.teardownBench();

	    const bool wantPng = !FLAGS_writePath.isEmpty() && FLAGS_writePath[0];
	    if (wantPng) {
	        SkString pngFilename = SkOSPath::Join(FLAGS_writePath[0], "gpu");
	        pngFilename = SkOSPath::Join(pngFilename.c_str(), bench->getUniqueName());
	        pngFilename.append(".png");
	        write_canvas_png(&ascbt.fTarget, pngFilename);
	    }
	    return loops;
	}

	// One timing sample as produced by time_sample().
	struct Sample {
	    // Time measured on the calling thread (result of timeSample()).
	    double fCpu;
	    // Smallest frame time recorded by the background timing thread.
	    double fGpu;
	};

	// Takes one timing sample of 'bench', drawing 'loops' iterations per frame in a freshly
	// set-up context.
	//   fCpu: the value returned by AutoSetupContextBenchAndTarget::timeSample().
	//   fGpu: with FLAGS_useBackgroundThread, the smallest frame time recorded by the
	//         background TimingThread; otherwise 0.
	Sample time_sample(Benchmark* bench, int loops) {
	    AutoSetupContextBenchAndTarget ascbt(bench);

	    // Give both fields a defined value up front: callers read fGpu unconditionally, and it
	    // is only measured when the background thread is in use.
	    Sample sample;
	    sample.fCpu = 0;
	    sample.fGpu = 0;
	    if (FLAGS_useBackgroundThread) {
	        TimingThread timingThread(ascbt.fTarget.gl());
	        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
	        nativeThread->start();
	        sample.fCpu = ascbt.timeSample(loops, &timingThread);
	        // The timing thread must have finished before its timings are read.
	        nativeThread->join();

	        // return the min
	        double min = SK_ScalarMax;
	        for (int i = 0; i < timingThread.timings().count(); i++) {
	            min = SkTMin(min, timingThread.timings()[i]);
	        }
	        sample.fGpu = min;
	    } else {
	        sample.fCpu = ascbt.timeSample(loops, nullptr);
	    }

	    ascbt.teardownBench();

	    return sample;
	}

	} // namespace kilobench

	// Size, in bytes, of the buffers kilobench_main() uses to pass a formatted result from the
	// forked child to the parent through the pipe.
	static const int kOutResultSize = 1024;

	void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
	SkString newName(name);
	newName.appendf("_%s", mod);
	Stats stats(samples);
	const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
	SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
	, loops
	, HUMANIZE(stats.min)
	, HUMANIZE(stats.median)
	, HUMANIZE(stats.mean)
	, HUMANIZE(stats.max)
	, stddev_percent
	, stats.plot.c_str()
	, "gpu"
	, newName.c_str()
	);
	}

	// Runs every benchmark produced by BenchmarkStream and prints one result row per benchmark
	// (plus a second row with the background-thread timings when FLAGS_useBackgroundThread is
	// set).
	//
	// For each benchmark, FLAGS_samples + 1 rounds are run: round 0 determines the loop count,
	// every later round takes one timing sample. With FLAGS_useMultiProcess each round runs in a
	// forked child that sends its result back to the parent as text through a pipe; otherwise
	// the rounds run in this process. Returns 0 (in both the parent and each child).
	int kilobench_main() {
	    kilobench::BenchmarkStream benchStream;

	    SkDebugf("loops\tmin\tmedian\tmean\tmax\tstddev\t%-*s\tconfig\tbench\n",
	             FLAGS_samples, "samples");

	    int descriptors[2];
	    if (pipe(descriptors) != 0) {
	        SkFAIL("Failed to open a pipe\n");
	    }

	    while (Benchmark* b = benchStream.next()) {
	        SkAutoTDelete<Benchmark> bench(b);

	        int loops = 1;
	        SkTArray<double> cpuSamples;
	        SkTArray<double> gpuSamples;
	        for (int i = 0; i < FLAGS_samples + 1; i++) {
	            // We fork off a new process to setup the grcontext and run the test while we wait
	            if (FLAGS_useMultiProcess) {
	                int childPid = fork();
	                if (childPid > 0) {
	                    // Zero the buffer and force a terminator so sscanf never reads
	                    // indeterminate bytes, whatever the child managed to send.
	                    char result[kOutResultSize] = { 0 };
	                    if (read(descriptors[0], result, kOutResultSize) < 0) {
	                        SkFAIL("Failed to read from pipe\n");
	                    }
	                    result[kOutResultSize - 1] = '\0';

	                    // if samples == 0 then parse # of loops
	                    // else parse float
	                    if (i == 0) {
	                        if (sscanf(result, "%d", &loops) != 1) {
	                            SkFAIL("Failed to parse loop count from child\n");
	                        }
	                    } else {
	                        // Parse into locals first so that a malformed message can never
	                        // leave uninitialized entries in the sample arrays.
	                        double cpu = 0;
	                        double gpu = 0;
	                        if (sscanf(result, "%lf %lf", &cpu, &gpu) != 2) {
	                            SkFAIL("Failed to parse sample from child\n");
	                        }
	                        cpuSamples.push_back(cpu);
	                        gpuSamples.push_back(gpu);
	                    }

	                    // wait until exit
	                    int status;
	                    waitpid(childPid, &status, 0);
	                } else if (0 == childPid) {
	                    char result[kOutResultSize];
	                    if (i == 0) {
	                        snprintf(result, sizeof(result), "%d", kilobench::setup_loops(bench));
	                    } else {
	                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
	                        snprintf(result, sizeof(result), "%lf %lf", sample.fCpu, sample.fGpu);
	                    }

	                    // Make sure to write the null terminator
	                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
	                        SkFAIL("Failed to write to pipe\n");
	                    }
	                    // The child is done once its result has been sent.
	                    return 0;
	                } else {
	                    SkFAIL("Fork failed\n");
	                }
	            } else {
	                if (i == 0) {
	                    loops = kilobench::setup_loops(bench);
	                } else {
	                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
	                    cpuSamples.push_back(sample.fCpu);
	                    gpuSamples.push_back(sample.fGpu);
	                }
	            }
	        }

	        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
	        if (FLAGS_useBackgroundThread) {
	            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
	        }
	    }

	    // All benchmarks are done; release both ends of the pipe.
	    close(descriptors[0]);
	    close(descriptors[1]);
	    return 0;
	}

	#ifndef SK_BUILD_FOR_IOS
	// Program entry point (not compiled for iOS builds): parses the command-line flags and runs
	// the benchmarks, returning kilobench_main()'s result as the exit code.
	int main(int argc, char** argv) {
	    SkCommandLineFlags::Parse(argc, argv);
	    const int exitCode = kilobench_main();
	    return exitCode;
	}
	#endif