| // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include <unistd.h> |
| #ifdef __APPLE__ |
| #include <sys/time.h> |
| #endif |
| |
| #include <cstdint> |
| #include <cstdlib> |
| #include <ctime> |
| #include <iomanip> |
| #include <iostream> |
| #include <map> |
| #include <memory> |
| #include <vector> |
| |
| #include "multi_thread_transform.h" |
| #include "transform_kernels.h" |
| |
| using namespace gemmlowp::meta; |
| |
| double time() { |
| #ifdef __APPLE__ |
| timeval t; |
| gettimeofday(&t, nullptr); |
| return t.tv_sec + 1e-6 * t.tv_usec; |
| #else |
| timespec t; |
| clock_gettime(CLOCK_REALTIME, &t); |
| return t.tv_sec + 1e-9 * t.tv_nsec; |
| #endif |
| } |
| |
| #define kernel_size (16) |
| |
| template <typename Context, typename Params> |
| void run_benchmark(const std::string& name, int repetitions, int elements, |
| Context* context, const Params& params) { |
| std::cout << "Benchmark: " << name << std::endl; |
| std::cout << "Warmup single." << std::endl; |
| |
| for (int i = 0; i < 10; ++i) { |
| Transform1D<Params, kernel_size>(params); |
| } |
| |
| std::cout << "Benchmark single." << std::endl; |
| |
| double start = time(); |
| |
| for (int i = 0; i < repetitions; ++i) { |
| Transform1D<Params, kernel_size>(params); |
| } |
| |
| double wall_time = time() - start; |
| double ops = static_cast<double>(elements) * repetitions; |
| std::cout << "Avg: " << (wall_time / repetitions) << std::endl; |
| std::cout << "Perf: " << static_cast<std::int64_t>(ops / wall_time) << "/s." |
| << std::endl; |
| |
| std::cout << "Warmup single." << std::endl; |
| |
| for (int i = 0; i < 10; ++i) { |
| MultiThreadTransform1D<Context, Params, kernel_size>(context, params); |
| } |
| |
| std::cout << "Benchmark multi." << std::endl; |
| |
| start = time(); |
| |
| for (int i = 0; i < repetitions; ++i) { |
| MultiThreadTransform1D<Context, Params, kernel_size>(context, params); |
| } |
| |
| wall_time = time() - start; |
| ops = static_cast<double>(elements) * repetitions; |
| std::cout << "Avg: " << (wall_time / repetitions) << std::endl; |
| std::cout << "Perf: " << static_cast<std::int64_t>(ops / wall_time) << "/s." |
| << std::endl; |
| } |
| |
| int main() { |
| const int repetitions = 500; |
| const int elements = 4 * 1024 * 1024; |
| |
| std::unique_ptr<std::int32_t[]> int32_array(new std::int32_t[elements]); |
| std::unique_ptr<std::uint8_t[]> uint8_array(new std::uint8_t[elements]); |
| std::unique_ptr<float[]> float_array(new float[elements]); |
| |
| typedef SimpleContext<gemmlowp::WorkersPool> Context; |
| Context context(4, new gemmlowp::WorkersPool()); |
| |
| typedef Transform1DParams<std::int32_t, std::uint8_t, Requantize> RequantizeParams; |
| RequantizeParams requantize_params; |
| requantize_params.input = int32_array.get(); |
| requantize_params.output = uint8_array.get(); |
| requantize_params.kernel.count = elements; |
| requantize_params.kernel.input_range_min = -100.0f; |
| requantize_params.kernel.input_range_scale = |
| 200.0f / ((static_cast<std::int64_t>(1) << 32) - 1); |
| requantize_params.kernel.input_range_offset = |
| static_cast<float>(std::numeric_limits<std::int32_t>::lowest()); |
| requantize_params.kernel.output_range_min = -200.0f; |
| requantize_params.kernel.one_over_output_range_scale = |
| static_cast<float>((static_cast<std::int64_t>(1) << 8) - 1) / 500.0f; |
| requantize_params.kernel.output_range_offset = |
| static_cast<float>(std::numeric_limits<std::uint8_t>::lowest()); |
| |
| run_benchmark("Requantize", repetitions, elements, &context, |
| requantize_params); |
| |
| typedef Transform1DParams<std::uint8_t, float, Dequantize> DequantizeParams; |
| DequantizeParams dequantize_params; |
| dequantize_params.input = uint8_array.get(); |
| dequantize_params.output = float_array.get(); |
| dequantize_params.kernel.count = elements; |
| dequantize_params.kernel.range_min = -100.0f; |
| dequantize_params.kernel.range_scale = |
| static_cast<float>((static_cast<std::int64_t>(1) << 8) - 1) / 200.0f; |
| dequantize_params.kernel.range_offset = |
| static_cast<float>(std::numeric_limits<std::uint8_t>::lowest()); |
| |
| run_benchmark("Dequantize", repetitions, elements, &context, |
| dequantize_params); |
| |
| typedef Transform1DParams<float, std::uint8_t, Quantize> QuantizeParams; |
| QuantizeParams quantize_params; |
| quantize_params.input = float_array.get(); |
| quantize_params.output = uint8_array.get(); |
| quantize_params.kernel.count = elements; |
| quantize_params.kernel.range_min = -100.0f; |
| quantize_params.kernel.range_scale = |
| 200.0f / ((static_cast<std::int64_t>(1) << 8) - 1); |
| quantize_params.kernel.range_offset = |
| static_cast<float>(std::numeric_limits<std::uint8_t>::lowest()); |
| |
| run_benchmark("Quantize", repetitions, elements, &context, quantize_params); |
| |
| return 0; |
| } |