Fix HotSort benchmark and typo

Bug: skia:
Change-Id: I1805a1e08e87a898f6d00ab86daa3fac1d3d5916
Reviewed-on: https://skia-review.googlesource.com/149031
Reviewed-by: Allan MacKinnon <allanmac@google.com>
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Auto-Submit: Allan MacKinnon <allanmac@google.com>
diff --git a/src/compute/common/macros.h b/src/compute/common/macros.h
index 9528568..d65c126 100644
--- a/src/compute/common/macros.h
+++ b/src/compute/common/macros.h
@@ -64,8 +64,8 @@
 //
 //
 
-#define EVAL_MACRO(x)           x
-#define CONCAT_MACRO(a,b)       EVAL_MACRO(a)##EVAL_MACRO(b)
+#define CONCAT_MACRO_2(a,b)     a ## b
+#define CONCAT_MACRO(a,b)       CONCAT_MACRO_2(a,b)
 
 //
 // Convert byte pointer to a network order 32-bit integer to host
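Note on the revised macros above: the removed CONCAT_MACRO pasted the closing parenthesis of EVAL_MACRO(a) onto the EVAL_MACRO token that followed it, which is not a valid preprocessing token, and ## also suppresses macro expansion of its operands. The replacement is the standard two-level indirection so both arguments are fully expanded before pasting. A minimal sketch of the idiom using the new macros; HS_SUFFIX and hs_sort_ are illustrative names, not from the source:

```c
#define CONCAT_MACRO_2(a,b)  a ## b               // pastes its operands as-is
#define CONCAT_MACRO(a,b)    CONCAT_MACRO_2(a,b)  // expands a and b, then pastes

#define HS_SUFFIX            u32                  // illustrative only

// CONCAT_MACRO_2(hs_sort_,HS_SUFFIX) would declare hs_sort_HS_SUFFIX because
// ## blocks expansion of its operands; the extra level expands HS_SUFFIX to
// u32 first, so this declares hs_sort_u32.
int CONCAT_MACRO(hs_sort_,HS_SUFFIX)(void);
```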
diff --git a/src/compute/hs/README.md b/src/compute/hs/README.md
index 46e4368..753fa08 100644
--- a/src/compute/hs/README.md
+++ b/src/compute/hs/README.md
@@ -2,7 +2,7 @@
 # HotSort
 
 HotSort is a high-performance GPU-accelerated integer sorting library
-for Vulkan, CUDA and OpenCL GPUs.
+for Vulkan, CUDA and OpenCL compute APIs.
 
 HotSort's advantages include:
 
@@ -14,16 +14,15 @@
 * A concurrency-friendly dense kernel grid
 * Support for GPU post-processing of sorted results
 
-The HotSort sorting kernels are typically significantly faster than
-other GPU sorting implementations when sorting arrays of smaller than
-500K-2M keys.
+HotSort is typically significantly faster than other GPU-accelerated
+implementations when sorting arrays smaller than 500K-2M keys.
 
 ## Benchmarks
 
 ### Throughput
 
 Here are results for HotSort on Vulkan and CUDA sorting 32-bit and
-64-bit keys on a 640 core Quadro M1200:
+64-bit keys on a 640-core Quadro M1200:
 
 ![](images/hs_nvidia_sm35_u32_mkeys.svg)
 ![](images/hs_nvidia_sm35_u64_mkeys.svg)
diff --git a/src/compute/hs/cuda/bench/main.c b/src/compute/hs/cuda/bench/main.c
index 579d48f..221b451 100644
--- a/src/compute/hs/cuda/bench/main.c
+++ b/src/compute/hs/cuda/bench/main.c
@@ -1,5 +1,3 @@
-// -*- compile-command: "nvcc -arch sm_50 -D NDEBUG -I ../../.. -o hs_bench_cuda main.c sort.cpp ../../../common/cuda/assert_cuda.c ../../../common/util.c ../sm_35/u32/hs_cuda_u32.cu ../sm_35/u64/hs_cuda_u64.cu"; -*-
-
 /*
  * Copyright 2016 Google Inc.
  *
@@ -265,7 +263,8 @@
          uint32_t              const   count_step,
          uint32_t              const   loops,
          uint32_t              const   warmup,
-         bool                  const   linearize)
+         bool                  const   linearize,
+         bool                  const   verify)
 {
   //
   // return if nothing to do
@@ -396,42 +395,47 @@
         }
 
       //
-      // copy back the results
-      //
-      size_t const size_padded_in = count_padded_in * key_size;
-
-      cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost));
-      cuda(Memcpy(vout_h,  vout_d,size_padded_in,cudaMemcpyDeviceToHost));
-
-      //
-      // sort the input with another algorithm
-      //
-      double cpu_ns;
-
-      char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
-
-      // transpose the cpu sorted slabs before comparison
-      if (!linearize) {
-        hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
-      }
-
-      //
       // verify
       //
-      bool const verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
+      char const * cpu_algo = NULL;
+      double       cpu_ns   = 1.0;
+      bool         verified = false;
+
+      if (verify)
+        {
+          //
+          // copy back the results
+          //
+          size_t const size_padded_in = count_padded_in * key_size;
+
+          cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost));
+          cuda(Memcpy(vout_h,  vout_d,size_padded_in,cudaMemcpyDeviceToHost));
+
+          //
+          // sort the input with another algorithm
+          //
+          cpu_algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
+
+          // transpose the cpu sorted slabs before comparison
+          if (!linearize) {
+            hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
+          }
+
+          verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
 
 #ifndef NDEBUG
-      if (!verified)
-        {
-          if (hs_words == 1) {
-            hs_debug_u32(hs_width,hs_height,vout_h,  count);
-            hs_debug_u32(hs_width,hs_height,sorted_h,count);
-          } else { // ulong
-            hs_debug_u64(hs_width,hs_height,vout_h,  count);
-            hs_debug_u64(hs_width,hs_height,sorted_h,count);
-          }
-        }
+          if (!verified)
+            {
+              if (hs_words == 1) {
+                hs_debug_u32(hs_width,hs_height,vout_h,  count);
+                hs_debug_u32(hs_width,hs_height,sorted_h,count);
+              } else { // ulong
+                hs_debug_u64(hs_width,hs_height,vout_h,  count);
+                hs_debug_u64(hs_width,hs_height,sorted_h,count);
+              }
+            }
 #endif
+        }
 
       //
       // REPORT
@@ -441,14 +445,14 @@
               driver_version,
               (hs_words == 1) ? "uint32_t" : "uint64_t",
               linearize       ? "linear"   : "slab",
-              verified        ? "  OK  "   : "*FAIL*",
+              verify ? (verified ? "  OK  " : "*FAIL*") : "UNVERIFIED",
               count,
               count_padded_in,
               count_padded_out,
               // CPU
-              algo,
-              cpu_ns / 1000000.0,                                   // milliseconds
-              1000.0 * count / cpu_ns,                              // mkeys / sec
+              verify ? cpu_algo : "UNVERIFIED",
+              verify ? (cpu_ns / 1000000.0)      : 0.0,             // milliseconds
+              verify ? (1000.0 * count / cpu_ns) : 0.0,             // mkeys / sec
               // GPU
               loops,
               elapsed_ms_sum / loops,                               // avg msecs
@@ -458,7 +462,7 @@
               (double) count          / (1000.0 * elapsed_ms_min)); // mkeys / sec - max
 
       // quit early if not verified
-      if (!verified)
+      if (verify && !verified)
         break;
     }
 
@@ -564,6 +568,7 @@
   uint32_t const loops      = (argc <= 6) ? HS_BENCH_LOOPS  : strtoul(argv[6],NULL,0);
   uint32_t const warmup     = (argc <= 7) ? HS_BENCH_WARMUP : strtoul(argv[7],NULL,0);
   bool     const linearize  = (argc <= 8) ? true            : strtoul(argv[8],NULL,0);
+  bool     const verify     = (argc <= 9) ? true            : strtoul(argv[9],NULL,0);
 
   //
   // benchmark
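The new verify argument follows the same optional-positional pattern as the existing ones: anything past argc falls back to a default, otherwise strtoul() parses it and a nonzero value enables the flag. A minimal sketch of that idiom; the arg_flag helper below is illustrative, not part of the benchmark:

```c
#include <stdbool.h>
#include <stdlib.h>

// Positional argument with a compile-time default: missing -> default,
// present -> nonzero enables the flag (same idiom as linearize/verify above).
static bool
arg_flag(int argc, char const * argv[], int idx, bool dflt)
{
  return (argc <= idx) ? dflt : (strtoul(argv[idx],NULL,0) != 0);
}

// e.g.:  bool const verify = arg_flag(argc,argv,9,true);
```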
@@ -583,7 +588,8 @@
            count_step,
            loops,
            warmup,
-           linearize);
+           linearize,
+           verify);
 
   //
   // cleanup
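As a unit check on the mkeys/sec columns emitted by the REPORT block above: keys per millisecond is thousands of keys per second, so dividing the key count by 1000 times the elapsed milliseconds yields millions of keys per second. A small standalone example with illustrative numbers only:

```c
#include <stdio.h>

int main(void)
{
  double const count      = 1048576.0;  // 2^20 keys (illustrative)
  double const elapsed_ms = 2.0;        // illustrative timing

  // Same expression as the report: count / (1000.0 * elapsed_ms)
  printf("%.1f mkeys/sec\n", count / (1000.0 * elapsed_ms));  // prints ~524.3
  return 0;
}
```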
diff --git a/src/compute/hs/cuda/bench/sort.cpp b/src/compute/hs/cuda/bench/sort.cpp
index 624b153..1f2781d 100644
--- a/src/compute/hs/cuda/bench/sort.cpp
+++ b/src/compute/hs/cuda/bench/sort.cpp
@@ -51,6 +51,7 @@
 
 #undef  HS_USE_PARALLEL_SORT
 #undef  HS_USE_STD_SORT
+#undef  HS_USE_QSORT
 #define HS_USE_QSORT
 
 #include <stdlib.h>
diff --git a/src/compute/hs/vk/bench/Makefile b/src/compute/hs/vk/bench/Makefile
index b3ff7a0..9771c2f 100644
--- a/src/compute/hs/vk/bench/Makefile
+++ b/src/compute/hs/vk/bench/Makefile
@@ -22,7 +22,13 @@
 #
 #
 
-VULKAN_SDK	=	~/vulkan/1.1.82.0/x86_64
+GCC_OPT		=	-O2 -D NDEBUG
+GPP_OPT		=	-O2 -D NDEBUG -std=c++11
+
+#
+#
+#
+
 VULKAN_INC	=	$(VULKAN_SDK)/include
 
 #
@@ -33,10 +39,10 @@
 		g++ -o $@ $^ $(VULKAN_SDK)/lib/libvulkan.so.1
 
 $(OBJ_C):	$(SRC_C)
-		gcc -D NDEBUG            -c $^ -I ../../.. -I .. -I $(VULKAN_INC)
+		gcc $(GCC_OPT) -c $^ -I ../../.. -I .. -I $(VULKAN_INC)
 
 $(OBJ_CPP):	$(SRC_CPP)
-		g++ -D NDEBUG -std=c++11 -c $^
+		g++ $(GPP_OPT) -c $^
 
 
 .PHONY: clean