Fix HotSort benchmark and typo
Bug: skia:
Change-Id: I1805a1e08e87a898f6d00ab86daa3fac1d3d5916
Reviewed-on: https://skia-review.googlesource.com/149031
Reviewed-by: Allan MacKinnon <allanmac@google.com>
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Auto-Submit: Allan MacKinnon <allanmac@google.com>
diff --git a/src/compute/common/macros.h b/src/compute/common/macros.h
index 9528568..d65c126 100644
--- a/src/compute/common/macros.h
+++ b/src/compute/common/macros.h
@@ -64,8 +64,8 @@
//
//
-#define EVAL_MACRO(x) x
-#define CONCAT_MACRO(a,b) EVAL_MACRO(a)##EVAL_MACRO(b)
+#define CONCAT_MACRO_2(a,b) a ## b
+#define CONCAT_MACRO(a,b) CONCAT_MACRO_2(a,b)
//
// Convert byte pointer to a network order 32-bit integer to host
diff --git a/src/compute/hs/README.md b/src/compute/hs/README.md
index 46e4368..753fa08 100644
--- a/src/compute/hs/README.md
+++ b/src/compute/hs/README.md
@@ -2,7 +2,7 @@
# HotSort
HotSort is a high-performance GPU-accelerated integer sorting library
-for Vulkan, CUDA and OpenCL GPUs.
+for Vulkan, CUDA and OpenCL compute APIs.
HotSort's advantages include:
@@ -14,16 +14,15 @@
* A concurrency-friendly dense kernel grid
* Support for GPU post-processing of sorted results
-The HotSort sorting kernels are typically significantly faster than
-other GPU sorting implementations when sorting arrays of smaller than
-500K-2M keys.
+HotSort is typically significantly faster than other GPU-accelerated
+implementations when sorting arrays smaller than 500K-2M keys.
## Benchmarks
### Throughput
Here are results for HotSort on Vulkan and CUDA sorting 32-bit and
-64-bit keys on a 640 core Quadro M1200:
+64-bit keys on a 640-core Quadro M1200:


diff --git a/src/compute/hs/cuda/bench/main.c b/src/compute/hs/cuda/bench/main.c
index 579d48f..221b451 100644
--- a/src/compute/hs/cuda/bench/main.c
+++ b/src/compute/hs/cuda/bench/main.c
@@ -1,5 +1,3 @@
-// -*- compile-command: "nvcc -arch sm_50 -D NDEBUG -I ../../.. -o hs_bench_cuda main.c sort.cpp ../../../common/cuda/assert_cuda.c ../../../common/util.c ../sm_35/u32/hs_cuda_u32.cu ../sm_35/u64/hs_cuda_u64.cu"; -*-
-
/*
* Copyright 2016 Google Inc.
*
@@ -265,7 +263,8 @@
uint32_t const count_step,
uint32_t const loops,
uint32_t const warmup,
- bool const linearize)
+ bool const linearize,
+ bool const verify)
{
//
// return if nothing to do
@@ -396,42 +395,47 @@
}
//
- // copy back the results
- //
- size_t const size_padded_in = count_padded_in * key_size;
-
- cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost));
- cuda(Memcpy(vout_h, vout_d,size_padded_in,cudaMemcpyDeviceToHost));
-
- //
- // sort the input with another algorithm
- //
- double cpu_ns;
-
- char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
-
- // transpose the cpu sorted slabs before comparison
- if (!linearize) {
- hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
- }
-
- //
// verify
//
- bool const verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
+ char const * cpu_algo = NULL;
+ double cpu_ns = 1.0;
+ bool verified = false;
+
+ if (verify)
+ {
+ //
+ // copy back the results
+ //
+ size_t const size_padded_in = count_padded_in * key_size;
+
+ cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost));
+ cuda(Memcpy(vout_h, vout_d,size_padded_in,cudaMemcpyDeviceToHost));
+
+ //
+ // sort the input with another algorithm
+ //
+ cpu_algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
+
+ // transpose the device output slabs (vout_h) to linear order before comparison
+ if (!linearize) {
+ hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
+ }
+
+ verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
#ifndef NDEBUG
- if (!verified)
- {
- if (hs_words == 1) {
- hs_debug_u32(hs_width,hs_height,vout_h, count);
- hs_debug_u32(hs_width,hs_height,sorted_h,count);
- } else { // ulong
- hs_debug_u64(hs_width,hs_height,vout_h, count);
- hs_debug_u64(hs_width,hs_height,sorted_h,count);
- }
- }
+ if (!verified)
+ {
+ if (hs_words == 1) {
+ hs_debug_u32(hs_width,hs_height,vout_h, count);
+ hs_debug_u32(hs_width,hs_height,sorted_h,count);
+ } else { // ulong
+ hs_debug_u64(hs_width,hs_height,vout_h, count);
+ hs_debug_u64(hs_width,hs_height,sorted_h,count);
+ }
+ }
#endif
+ }
//
// REPORT
@@ -441,14 +445,14 @@
driver_version,
(hs_words == 1) ? "uint32_t" : "uint64_t",
linearize ? "linear" : "slab",
- verified ? " OK " : "*FAIL*",
+ verify ? (verified ? " OK " : "*FAIL*") : "UNVERIFIED",
count,
count_padded_in,
count_padded_out,
// CPU
- algo,
- cpu_ns / 1000000.0, // milliseconds
- 1000.0 * count / cpu_ns, // mkeys / sec
+ verify ? cpu_algo : "UNVERIFIED",
+ verify ? (cpu_ns / 1000000.0) : 0.0, // milliseconds
+ verify ? (1000.0 * count / cpu_ns) : 0.0, // mkeys / sec
// GPU
loops,
elapsed_ms_sum / loops, // avg msecs
@@ -458,7 +462,7 @@
(double) count / (1000.0 * elapsed_ms_min)); // mkeys / sec - max
// quit early if not verified
- if (!verified)
+ if (verify && !verified)
break;
}
@@ -564,6 +568,7 @@
uint32_t const loops = (argc <= 6) ? HS_BENCH_LOOPS : strtoul(argv[6],NULL,0);
uint32_t const warmup = (argc <= 7) ? HS_BENCH_WARMUP : strtoul(argv[7],NULL,0);
bool const linearize = (argc <= 8) ? true : strtoul(argv[8],NULL,0);
+ bool const verify = (argc <= 9) ? true : strtoul(argv[9],NULL,0);
//
// benchmark
@@ -583,7 +588,8 @@
count_step,
loops,
warmup,
- linearize);
+ linearize,
+ verify);
//
// cleanup
diff --git a/src/compute/hs/cuda/bench/sort.cpp b/src/compute/hs/cuda/bench/sort.cpp
index 624b153..1f2781d 100644
--- a/src/compute/hs/cuda/bench/sort.cpp
+++ b/src/compute/hs/cuda/bench/sort.cpp
@@ -51,6 +51,7 @@
#undef HS_USE_PARALLEL_SORT
#undef HS_USE_STD_SORT
+#undef HS_USE_QSORT
#define HS_USE_QSORT
#include <stdlib.h>
diff --git a/src/compute/hs/vk/bench/Makefile b/src/compute/hs/vk/bench/Makefile
index b3ff7a0..9771c2f 100644
--- a/src/compute/hs/vk/bench/Makefile
+++ b/src/compute/hs/vk/bench/Makefile
@@ -22,7 +22,13 @@
#
#
-VULKAN_SDK = ~/vulkan/1.1.82.0/x86_64
+GCC_OPT = -O2 -D NDEBUG
+GPP_OPT = -O2 -D NDEBUG -std=c++11
+
+#
+#
+#
+
VULKAN_INC = $(VULKAN_SDK)/include
#
@@ -33,10 +39,10 @@
g++ -o $@ $^ $(VULKAN_SDK)/lib/libvulkan.so.1
$(OBJ_C): $(SRC_C)
- gcc -D NDEBUG -c $^ -I ../../.. -I .. -I $(VULKAN_INC)
+ gcc $(GCC_OPT) -c $^ -I ../../.. -I .. -I $(VULKAN_INC)
$(OBJ_CPP): $(SRC_CPP)
- g++ -D NDEBUG -std=c++11 -c $^
+ g++ $(GPP_OPT) -c $^
.PHONY: clean