blob: f02adb54703928f5e4481f08595e64329341d193 [file] [log] [blame]
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cmath>
8#include <functional>
9#include <random>
10#include <vector>
11
12#include <xnnpack.h>
13
14#include <benchmark/benchmark.h>
15
16#include "bench/end2end.h"
17#include "bench/utils.h"
18#include "models/models.h"
19#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/params.h>
22
23
24static void GEMMEnd2EndBenchmark(
25 benchmark::State& state,
26 models::ExecutionPlanFactory model_factory,
27 xnn_qs8_gemm_ukernel_function gemm,
28 xnn_qs8_igemm_ukernel_function igemm,
29 xnn_qs8_gemm_ukernel_function gemm1,
30 xnn_qs8_igemm_ukernel_function igemm1,
31 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
32 benchmark::utils::IsaCheckFunction isa_check = nullptr)
33{
34 if (isa_check && !isa_check(state)) {
35 return;
36 }
37 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
38 state.SkipWithError("failed to initialize XNNPACK");
39 return;
40 }
41
42 // Override microkernels chosen in xnn_initialize
43 // Note: do not directly assign to xnn_params.qs8.gemm because it breaks older gcc.
44 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
45 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
46 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
47 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
48 xnn_params.qs8.gemm.mr = mr;
49 xnn_params.qs8.gemm.nr = nr;
50 xnn_params.qs8.gemm.log2_kr = log2_kr;
51 xnn_params.qs8.gemm.log2_sr = log2_sr;
52
53 auto execution_plan = model_factory(nullptr);
54 if (execution_plan.empty()) {
55 state.SkipWithError("failed to create a model");
56 return;
57 }
58
59 for (auto _ : state) {
60 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
61 xnn_status status = xnn_run_operator(op.get(), nullptr);
62 if (status != xnn_status_success) {
63 state.SkipWithError("failed to run a model");
64 return;
65 }
66 }
67 }
68
69 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
70 if (cpu_frequency != 0) {
71 state.counters["cpufreq"] = cpu_frequency;
72 }
73}
74
75#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
76 static void qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
77 GEMMEnd2EndBenchmark(state, model,
78 xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64,
79 xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot,
80 xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64,
81 xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
82 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
83 benchmark::utils::CheckNEONDOT);
84 }
85
86 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64)
87#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
88
Frank Barcharddc909cb2021-02-08 13:59:31 -080089#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard15c00362021-02-08 23:21:43 -080090 static void qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
91 GEMMEnd2EndBenchmark(state, model,
92 xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
93 xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
94 xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
95 xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
96 1 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
97 benchmark::utils::CheckNEON);
98 }
99
100 static void qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
101 GEMMEnd2EndBenchmark(state, model,
102 xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
103 xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
104 xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
105 xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
106 1 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
107 benchmark::utils::CheckNEON);
108 }
109
110 static void qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
111 GEMMEnd2EndBenchmark(state, model,
112 xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane,
113 xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane,
114 xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
115 xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
116 2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
117 benchmark::utils::CheckNEON);
118 }
119
Frank Barcharddc909cb2021-02-08 13:59:31 -0800120 static void qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
121 GEMMEnd2EndBenchmark(state, model,
122 xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane,
123 xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane,
124 xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
125 xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
126 2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
127 benchmark::utils::CheckNEON);
128 }
129
Frank Barchard15c00362021-02-08 23:21:43 -0800130 static void qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
131 GEMMEnd2EndBenchmark(state, model,
132 xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane,
133 xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane,
134 xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
135 xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
136 3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
137 benchmark::utils::CheckNEON);
138 }
139
140 static void qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
141 GEMMEnd2EndBenchmark(state, model,
142 xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane,
143 xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane,
144 xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
145 xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
146 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
147 benchmark::utils::CheckNEON);
148 }
149
150 static void qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
151 GEMMEnd2EndBenchmark(state, model,
152 xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane,
153 xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane,
154 xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
155 xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
156 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
157 benchmark::utils::CheckNEON);
158 }
159
160 static void qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
161 GEMMEnd2EndBenchmark(state, model,
162 xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane,
163 xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane,
164 xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
165 xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
166 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
167 benchmark::utils::CheckNEON);
168 }
169
170 static void qs8_gemm_minmax_ukernel_1x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
171 GEMMEnd2EndBenchmark(state, model,
172 xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
173 xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
174 xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
175 xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
176 1 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
177 benchmark::utils::CheckNEONDOT);
178 }
179
180 static void qs8_gemm_minmax_ukernel_1x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
181 GEMMEnd2EndBenchmark(state, model,
182 xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
183 xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
184 xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
185 xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
186 1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
187 benchmark::utils::CheckNEONDOT);
188 }
189
190 static void qs8_gemm_minmax_ukernel_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
191 GEMMEnd2EndBenchmark(state, model,
192 xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot,
193 xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot,
194 xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
195 xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
196 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
197 benchmark::utils::CheckNEONDOT);
198 }
199
200 static void qs8_gemm_minmax_ukernel_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
201 GEMMEnd2EndBenchmark(state, model,
202 xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot,
203 xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot,
204 xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
205 xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
206 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
207 benchmark::utils::CheckNEONDOT);
208 }
209
210 static void qs8_gemm_minmax_ukernel_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
211 GEMMEnd2EndBenchmark(state, model,
212 xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot,
213 xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot,
214 xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
215 xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
216 6 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
217 benchmark::utils::CheckNEONDOT);
218 }
219
220 static void qs8_gemm_minmax_ukernel_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
221 GEMMEnd2EndBenchmark(state, model,
222 xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot,
223 xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot,
224 xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
225 xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
226 6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
227 benchmark::utils::CheckNEONDOT);
228 }
229
230 static void qs8_gemm_minmax_ukernel_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
231 GEMMEnd2EndBenchmark(state, model,
232 xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot,
233 xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot,
234 xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
235 xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
236 8 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
237 benchmark::utils::CheckNEONDOT);
238 }
239
240 static void qs8_gemm_minmax_ukernel_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
241 GEMMEnd2EndBenchmark(state, model,
242 xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot,
243 xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot,
244 xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
245 xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
246 8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
247 benchmark::utils::CheckNEONDOT);
248 }
249
250 static void qs8_gemm_minmax_ukernel_12x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
251 GEMMEnd2EndBenchmark(state, model,
252 xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot,
253 xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot,
254 xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
255 xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
256 12 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
257 benchmark::utils::CheckNEONDOT);
258 }
259
260 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane);
261 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane);
262 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane);
Frank Barcharddc909cb2021-02-08 13:59:31 -0800263 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane);
Frank Barchard15c00362021-02-08 23:21:43 -0800264 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
265 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
266 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
267 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
268 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c4__neondot);
269 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c4__neondot);
270 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8c4__neondot);
271 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c4__neondot);
272 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_6x8c4__neondot);
273 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_6x16c4__neondot);
274 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_8x8c4__neondot);
275 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_8x16c4__neondot);
276 BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_12x8c4__neondot);
Frank Barcharddc909cb2021-02-08 13:59:31 -0800277#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
278
279#if SCALAR_IGEMM
280static void qs8_gemm_minmax_ukernel_8x8c4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
281 GEMMEnd2EndBenchmark(state, model,
282 xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar,
283 xnn_qs8_igemm_minmax_ukernel_8x8c4__scalar,
284 xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar,
285 xnn_qs8_igemm_minmax_ukernel_8x8c4__scalar,
286 8 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */);
287
288BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_8x8c4__scalar);
289#endif // SCALAR_IGEMM
290
291#ifndef XNNPACK_BENCHMARK_NO_MAIN
292BENCHMARK_MAIN();
293#endif