// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

#include <xnnpack.h>

#include <benchmark/benchmark.h>

#include "bench/end2end.h"
#include "bench/utils.h"
#include "models/models.h"
#include <xnnpack/gemm.h>
#include <xnnpack/igemm.h>
#include <xnnpack/params.h>


24static void GEMMEnd2EndBenchmark(
25 benchmark::State& state,
Frank Barcharde72e2872019-10-31 11:12:15 -070026 models::ExecutionPlanFactory model_factory,
Marat Dukhan5f18d262019-10-31 10:24:14 -070027 xnn_f32_gemm_ukernel_function gemm,
28 xnn_f32_igemm_ukernel_function igemm,
29 xnn_f32_gemm_ukernel_function gemm1,
30 xnn_f32_igemm_ukernel_function igemm1,
Marat Dukhanc8466f52019-11-25 18:01:10 -080031 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
32 benchmark::utils::IsaCheckFunction isa_check = nullptr)
Marat Dukhan5f18d262019-10-31 10:24:14 -070033{
Marat Dukhanc8466f52019-11-25 18:01:10 -080034 if (isa_check && !isa_check(state)) {
35 return;
36 }
Marat Dukhan04f03be2019-11-19 12:36:47 -080037 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
Marat Dukhan5f18d262019-10-31 10:24:14 -070038 state.SkipWithError("failed to initialize XNNPACK");
39 return;
40 }
41
42 // Override microkernels chosen in xnn_initialize
43 xnn_params.f32.gemm = (struct gemm_parameters) {
44 .gemm = xnn_gemm_ukernel_function(gemm),
45 .igemm = xnn_igemm_ukernel_function(igemm),
46 .gemm1 = xnn_gemm_ukernel_function(gemm1),
47 .igemm1 = xnn_igemm_ukernel_function(igemm1),
48 .mr = mr,
49 .nr = nr,
50 .log2_kr = log2_kr,
51 .log2_sr = log2_sr,
52 };
53
54 auto execution_plan = model_factory(nullptr);
55 if (execution_plan.empty()) {
56 state.SkipWithError("failed to create a model");
57 return;
58 }
59
60 for (auto _ : state) {
61 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
62 xnn_status status = xnn_run_operator(op.get(), nullptr);
63 if (status != xnn_status_success) {
64 state.SkipWithError("failed to run a model");
65 return;
66 }
67 }
68 }
Frank Barcharde72e2872019-10-31 11:12:15 -070069 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
Marat Dukhan5f18d262019-10-31 10:24:14 -070070}
71
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
73 static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
74 GEMMEnd2EndBenchmark(state, model,
75 xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
76 xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
77 xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
78 xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
79 4 /* mr */, 12 /* nr */);
80 }
81
82 static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
83 GEMMEnd2EndBenchmark(state, model,
84 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53,
85 xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53,
86 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
87 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
88 4 /* mr */, 8 /* nr */);
89 }
90
91 static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, models::ExecutionPlanFactory model) {
92 GEMMEnd2EndBenchmark(state, model,
93 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
Frank Barchard387c2d12019-12-16 19:14:07 -080094 xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
Marat Dukhan5f18d262019-10-31 10:24:14 -070095 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
96 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
97 4 /* mr */, 8 /* nr */);
98 }
99
100 static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
101 GEMMEnd2EndBenchmark(state, model,
102 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
103 xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
104 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
105 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
106 4 /* mr */, 8 /* nr */);
107 }
108
109 static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
110 GEMMEnd2EndBenchmark(state, model,
111 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64,
Frank Barchard91317c52019-11-22 10:54:35 -0800112 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64,
113 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
114 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700115 4 /* mr */, 8 /* nr */);
116 }
117
118 static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
119 GEMMEnd2EndBenchmark(state, model,
120 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128,
Frank Barchard91317c52019-11-22 10:54:35 -0800121 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128,
122 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
123 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700124 4 /* mr */, 8 /* nr */);
125 }
126
Frank Barchard387c2d12019-12-16 19:14:07 -0800127 static void f32_gemm_5x8__aarch64_neonfma_cortex_a57(benchmark::State& state, models::ExecutionPlanFactory model) {
128 GEMMEnd2EndBenchmark(state, model,
129 xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57,
130 xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57,
131 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
132 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
133 5 /* mr */, 8 /* nr */);
134 }
135
Marat Dukhan5f18d262019-10-31 10:24:14 -0700136 static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
137 GEMMEnd2EndBenchmark(state, model,
138 xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75,
139 xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75,
140 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
141 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
142 5 /* mr */, 8 /* nr */);
143 }
144
145 static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
146 GEMMEnd2EndBenchmark(state, model,
147 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
148 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
149 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
150 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
151 6 /* mr */, 8 /* nr */);
152 }
153
Marat Dukhan5f18d262019-10-31 10:24:14 -0700154 static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, models::ExecutionPlanFactory model) {
155 GEMMEnd2EndBenchmark(state, model,
156 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
157 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
158 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
159 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
160 6 /* mr */, 8 /* nr */);
161 }
162
Frank Barchard387c2d12019-12-16 19:14:07 -0800163 static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, models::ExecutionPlanFactory model) {
164 GEMMEnd2EndBenchmark(state, model,
165 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
166 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
167 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
168 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
169 6 /* mr */, 8 /* nr */);
170 }
171
Marat Dukhan5f18d262019-10-31 10:24:14 -0700172 static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
173 GEMMEnd2EndBenchmark(state, model,
174 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
175 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
176 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
177 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
178 6 /* mr */, 8 /* nr */);
179 }
180
181 static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
182 GEMMEnd2EndBenchmark(state, model,
183 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64,
Frank Barchard91317c52019-11-22 10:54:35 -0800184 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
185 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
186 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700187 6 /* mr */, 8 /* nr */);
188 }
189
190 static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
191 GEMMEnd2EndBenchmark(state, model,
192 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128,
Frank Barchard91317c52019-11-22 10:54:35 -0800193 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
194 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
195 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
196 6 /* mr */, 8 /* nr */);
197 }
198
199 static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
200 GEMMEnd2EndBenchmark(state, model,
201 xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64,
202 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64,
203 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
204 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
205 4 /* mr */, 8 /* nr */);
206 }
207
208 static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
209 GEMMEnd2EndBenchmark(state, model,
210 xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128,
211 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128,
212 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
213 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
214 4 /* mr */, 8 /* nr */);
215 }
216
217 static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
218 GEMMEnd2EndBenchmark(state, model,
219 xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
220 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
221 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
222 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700223 6 /* mr */, 8 /* nr */);
224 }
225
Frank Barchard69172d92019-11-26 16:22:39 -0800226 static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
227 GEMMEnd2EndBenchmark(state, model,
228 xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128,
229 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128,
230 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
231 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
232 6 /* mr */, 8 /* nr */);
233 }
234
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800235 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_ld64)
236 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_ld128);
237 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_ld64);
238 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_ld128);
239 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
240 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
241 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
Frank Barchard387c2d12019-12-16 19:14:07 -0800242 BENCHMARK_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a57);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800243 BENCHMARK_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a75);
244 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a53);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800245 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a73);
Frank Barchard387c2d12019-12-16 19:14:07 -0800246 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a57);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800247 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a75);
248 BENCHMARK_END2END(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
Marat Dukhan5f18d262019-10-31 10:24:14 -0700249
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800250 BENCHMARK_END2END(f32_gemm_4x8__neonfma_lane_ld64);
251 BENCHMARK_END2END(f32_gemm_4x8__neonfma_lane_ld128);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700252
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800253 BENCHMARK_END2END(f32_gemm_6x8__neonfma_lane_ld64);
254 BENCHMARK_END2END(f32_gemm_6x8__neonfma_lane_ld128);
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
258 static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
259 GEMMEnd2EndBenchmark(state, model,
260 xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64,
Miao Wang3fa1f012020-02-17 22:45:06 +0000261 xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64,
Frank Barchardcab94932019-12-03 10:48:54 -0800262 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
263 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
264 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
265 benchmark::utils::CheckNEON);
266 }
Frank Barchard13916042019-12-11 10:56:34 -0800267 static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
268 GEMMEnd2EndBenchmark(state, model,
269 xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53,
270 xnn_f32_igemm_ukernel_4x8__neon_lane_ld64,
271 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
272 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
273 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
274 benchmark::utils::CheckNEON);
275 }
Frank Barchard3e237f22019-12-04 23:08:51 -0800276 static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
277 GEMMEnd2EndBenchmark(state, model,
278 xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
Miao Wang3fa1f012020-02-17 22:45:06 +0000279 xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75,
Frank Barchard3e237f22019-12-04 23:08:51 -0800280 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
281 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
282 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
283 benchmark::utils::CheckNEON);
284 }
Frank Barchard9f7d5552019-12-12 10:58:10 -0800285 static void f32_gemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
286 GEMMEnd2EndBenchmark(state, model,
287 xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75,
Miao Wang3fa1f012020-02-17 22:45:06 +0000288 xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75,
Frank Barchard9f7d5552019-12-12 10:58:10 -0800289 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
290 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
291 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
292 benchmark::utils::CheckNEON);
293 }
Frank Barchardcab94932019-12-03 10:48:54 -0800294
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800295 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_ld64);
Frank Barchard13916042019-12-11 10:56:34 -0800296 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800297 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
Frank Barchard9f7d5552019-12-12 10:58:10 -0800298 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_pld_cortex_a75);
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -0800303 static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
Marat Dukhan5f18d262019-10-31 10:24:14 -0700304 GEMMEnd2EndBenchmark(state, model,
Frank Barchard91317c52019-11-22 10:54:35 -0800305 xnn_f32_gemm_ukernel_4x8__neon_lane_ld64,
306 xnn_f32_igemm_ukernel_4x8__neon_lane_ld64,
307 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
308 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800309 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
310 benchmark::utils::CheckNEON);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700311 }
312
Frank Barchard91317c52019-11-22 10:54:35 -0800313 static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
Marat Dukhan5f18d262019-10-31 10:24:14 -0700314 GEMMEnd2EndBenchmark(state, model,
Frank Barchard91317c52019-11-22 10:54:35 -0800315 xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
316 xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
317 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
318 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800319 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
320 benchmark::utils::CheckNEON);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700321 }
322
Frank Barchard91317c52019-11-22 10:54:35 -0800323 static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
Marat Dukhan5f18d262019-10-31 10:24:14 -0700324 GEMMEnd2EndBenchmark(state, model,
Frank Barchard91317c52019-11-22 10:54:35 -0800325 xnn_f32_gemm_ukernel_6x8__neon_lane_ld64,
326 xnn_f32_igemm_ukernel_6x8__neon_lane_ld64,
327 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
328 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800329 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
330 benchmark::utils::CheckNEON);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700331 }
Frank Barchard69172d92019-11-26 16:22:39 -0800332
333 static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
334 GEMMEnd2EndBenchmark(state, model,
335 xnn_f32_gemm_ukernel_6x8__neon_lane_ld128,
336 xnn_f32_igemm_ukernel_6x8__neon_lane_ld128,
337 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
338 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
339 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
340 benchmark::utils::CheckNEON);
341 }
342
Frank Barchard5243bb02019-11-22 16:37:50 -0800343 static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
344 GEMMEnd2EndBenchmark(state, model,
345 xnn_f32_gemm_ukernel_4x8__neon_dup_ld64,
346 xnn_f32_igemm_ukernel_4x8__neon_dup_ld64,
347 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
348 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800349 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
350 benchmark::utils::CheckNEON);
Frank Barchard5243bb02019-11-22 16:37:50 -0800351 }
352
353 static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
354 GEMMEnd2EndBenchmark(state, model,
355 xnn_f32_gemm_ukernel_4x8__neon_dup_ld128,
356 xnn_f32_igemm_ukernel_4x8__neon_dup_ld128,
357 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
358 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800359 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
360 benchmark::utils::CheckNEON);
Frank Barchard5243bb02019-11-22 16:37:50 -0800361 }
362
363 static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
364 GEMMEnd2EndBenchmark(state, model,
365 xnn_f32_gemm_ukernel_6x8__neon_dup_ld64,
366 xnn_f32_igemm_ukernel_6x8__neon_dup_ld64,
367 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
368 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800369 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
370 benchmark::utils::CheckNEON);
Frank Barchard5243bb02019-11-22 16:37:50 -0800371 }
372
Frank Barchard69172d92019-11-26 16:22:39 -0800373 static void f32_gemm_6x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
374 GEMMEnd2EndBenchmark(state, model,
375 xnn_f32_gemm_ukernel_6x8__neon_dup_ld128,
376 xnn_f32_igemm_ukernel_6x8__neon_dup_ld128,
377 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
378 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
379 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
380 benchmark::utils::CheckNEON);
381 }
382
Frank Barchard5243bb02019-11-22 16:37:50 -0800383 static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
384 GEMMEnd2EndBenchmark(state, model,
385 xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64,
386 xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64,
387 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
388 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800389 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
390 benchmark::utils::CheckNEONFMA);
Frank Barchard5243bb02019-11-22 16:37:50 -0800391 }
392
393 static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
394 GEMMEnd2EndBenchmark(state, model,
395 xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128,
396 xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128,
397 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
398 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800399 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
400 benchmark::utils::CheckNEONFMA);
Frank Barchard5243bb02019-11-22 16:37:50 -0800401 }
402
403 static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
404 GEMMEnd2EndBenchmark(state, model,
405 xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64,
406 xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64,
407 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
408 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800409 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
410 benchmark::utils::CheckNEONFMA);
Frank Barchard5243bb02019-11-22 16:37:50 -0800411 }
Marat Dukhan5f18d262019-10-31 10:24:14 -0700412
Frank Barchard69172d92019-11-26 16:22:39 -0800413 static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
414 GEMMEnd2EndBenchmark(state, model,
415 xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128,
416 xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128,
417 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
418 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
419 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
420 benchmark::utils::CheckNEONFMA);
421 }
422
Frank Barcharddf06d802019-11-20 15:53:46 -0800423 static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
424 GEMMEnd2EndBenchmark(state, model,
425 xnn_f32_gemm_ukernel_4x8s4__neon,
426 xnn_f32_igemm_ukernel_4x8s4__neon,
427 xnn_f32_gemm_ukernel_1x8s4__neon,
428 xnn_f32_igemm_ukernel_1x8s4__neon,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800429 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
430 benchmark::utils::CheckNEON);
Frank Barcharddf06d802019-11-20 15:53:46 -0800431 }
432
433 static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
434 GEMMEnd2EndBenchmark(state, model,
435 xnn_f32_gemm_ukernel_4x8s4__neonfma,
436 xnn_f32_igemm_ukernel_4x8s4__neonfma,
437 xnn_f32_gemm_ukernel_1x8s4__neonfma,
438 xnn_f32_igemm_ukernel_1x8s4__neonfma,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800439 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
440 benchmark::utils::CheckNEONFMA);
Frank Barcharddf06d802019-11-20 15:53:46 -0800441 }
442
443 static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
444 GEMMEnd2EndBenchmark(state, model,
445 xnn_f32_gemm_ukernel_6x8s4__neon,
446 xnn_f32_igemm_ukernel_6x8s4__neon,
447 xnn_f32_gemm_ukernel_1x8s4__neon,
448 xnn_f32_igemm_ukernel_1x8s4__neon,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800449 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
450 benchmark::utils::CheckNEON);
Frank Barcharddf06d802019-11-20 15:53:46 -0800451 }
452
453 static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
454 GEMMEnd2EndBenchmark(state, model,
455 xnn_f32_gemm_ukernel_6x8s4__neonfma,
456 xnn_f32_igemm_ukernel_6x8s4__neonfma,
457 xnn_f32_gemm_ukernel_1x8s4__neonfma,
458 xnn_f32_igemm_ukernel_1x8s4__neonfma,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800459 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
460 benchmark::utils::CheckNEONFMA);
Frank Barcharddf06d802019-11-20 15:53:46 -0800461 }
462
463 static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
464 GEMMEnd2EndBenchmark(state, model,
465 xnn_f32_gemm_ukernel_8x8s4__neon,
466 xnn_f32_igemm_ukernel_8x8s4__neon,
467 xnn_f32_gemm_ukernel_1x8s4__neon,
468 xnn_f32_igemm_ukernel_1x8s4__neon,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800469 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
470 benchmark::utils::CheckNEON);
Frank Barcharddf06d802019-11-20 15:53:46 -0800471 }
472
473 static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
474 GEMMEnd2EndBenchmark(state, model,
475 xnn_f32_gemm_ukernel_8x8s4__neonfma,
476 xnn_f32_igemm_ukernel_8x8s4__neonfma,
477 xnn_f32_gemm_ukernel_1x8s4__neonfma,
478 xnn_f32_igemm_ukernel_1x8s4__neonfma,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800479 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
480 benchmark::utils::CheckNEONFMA);
Frank Barcharddf06d802019-11-20 15:53:46 -0800481 }
482
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800483 BENCHMARK_END2END(f32_gemm_4x8__neon_lane_ld64);
484 BENCHMARK_END2END(f32_gemm_4x8__neon_lane_ld128);
485 BENCHMARK_END2END(f32_gemm_6x8__neon_lane_ld64);
486 BENCHMARK_END2END(f32_gemm_6x8__neon_lane_ld128);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700487
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800488 BENCHMARK_END2END(f32_gemm_4x8__neon_dup_ld64);
489 BENCHMARK_END2END(f32_gemm_4x8__neon_dup_ld128);
490 BENCHMARK_END2END(f32_gemm_6x8__neon_dup_ld64);
491 BENCHMARK_END2END(f32_gemm_6x8__neon_dup_ld128);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700492
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800493 BENCHMARK_END2END(f32_gemm_4x8__neonfma_dup_ld64);
494 BENCHMARK_END2END(f32_gemm_4x8__neonfma_dup_ld128);
495 BENCHMARK_END2END(f32_gemm_6x8__neonfma_dup_ld64);
496 BENCHMARK_END2END(f32_gemm_6x8__neonfma_dup_ld128);
Frank Barcharddf06d802019-11-20 15:53:46 -0800497
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800498 BENCHMARK_END2END(f32_gemm_4x8s4__neon);
499 BENCHMARK_END2END(f32_gemm_6x8s4__neon);
500 BENCHMARK_END2END(f32_gemm_8x8s4__neon);
Frank Barchard69172d92019-11-26 16:22:39 -0800501
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800502 BENCHMARK_END2END(f32_gemm_4x8s4__neonfma);
503 BENCHMARK_END2END(f32_gemm_6x8s4__neonfma);
504 BENCHMARK_END2END(f32_gemm_8x8s4__neonfma);
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
508 static void f32_gemm_4x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
509 GEMMEnd2EndBenchmark(state, model,
510 xnn_f32_gemm_ukernel_4x8__sse_load1,
511 xnn_f32_igemm_ukernel_4x8__sse_load1,
512 xnn_f32_gemm_ukernel_1x8__sse_load1,
513 xnn_f32_igemm_ukernel_1x8__sse_load1,
514 4 /* mr */, 8 /* nr */);
515 }
516
517 static void f32_gemm_4x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
518 GEMMEnd2EndBenchmark(state, model,
519 xnn_f32_gemm_ukernel_4x8__sse_dup,
520 xnn_f32_igemm_ukernel_4x8__sse_dup,
521 xnn_f32_gemm_ukernel_1x8__sse_dup,
522 xnn_f32_igemm_ukernel_1x8__sse_dup,
523 4 /* mr */, 8 /* nr */);
524 }
525
526 static void f32_gemm_4x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
527 GEMMEnd2EndBenchmark(state, model,
528 xnn_f32_gemm_ukernel_4x8s4__sse,
529 xnn_f32_igemm_ukernel_4x8s4__sse,
530 xnn_f32_gemm_ukernel_1x8s4__sse,
531 xnn_f32_igemm_ukernel_1x8s4__sse,
532 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
533 }
534
Marat Dukhanfda12b82019-11-21 12:27:59 -0800535 static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
536 GEMMEnd2EndBenchmark(state, model,
537 xnn_f32_gemm_ukernel_4x8__avx_broadcast,
538 xnn_f32_igemm_ukernel_4x8__avx_broadcast,
539 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
540 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800541 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
542 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800543 }
544
545 static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
546 GEMMEnd2EndBenchmark(state, model,
547 xnn_f32_gemm_ukernel_5x8__avx_broadcast,
548 xnn_f32_igemm_ukernel_5x8__avx_broadcast,
549 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
550 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800551 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
552 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800553 }
554
555 static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
556 GEMMEnd2EndBenchmark(state, model,
557 xnn_f32_gemm_ukernel_6x8__avx_broadcast,
558 xnn_f32_igemm_ukernel_6x8__avx_broadcast,
559 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
560 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800561 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
562 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800563 }
564
565 static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
566 GEMMEnd2EndBenchmark(state, model,
567 xnn_f32_gemm_ukernel_7x8__avx_broadcast,
568 xnn_f32_igemm_ukernel_7x8__avx_broadcast,
569 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
570 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800571 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
572 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800573 }
574
Marat Dukhaneccfd712019-12-08 16:49:27 -0800575 static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
576 GEMMEnd2EndBenchmark(state, model,
577 xnn_f32_gemm_ukernel_3x16__avx_broadcast,
578 xnn_f32_igemm_ukernel_3x16__avx_broadcast,
579 xnn_f32_gemm_ukernel_1x16__avx_broadcast,
580 xnn_f32_igemm_ukernel_1x16__avx_broadcast,
581 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
582 benchmark::utils::CheckAVX);
583 }
584
585 static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
586 GEMMEnd2EndBenchmark(state, model,
587 xnn_f32_gemm_ukernel_4x16__avx_broadcast,
588 xnn_f32_igemm_ukernel_4x16__avx_broadcast,
589 xnn_f32_gemm_ukernel_1x16__avx_broadcast,
590 xnn_f32_igemm_ukernel_1x16__avx_broadcast,
591 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
592 benchmark::utils::CheckAVX);
593 }
594
595 static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
596 GEMMEnd2EndBenchmark(state, model,
597 xnn_f32_gemm_ukernel_5x16__avx_broadcast,
598 xnn_f32_igemm_ukernel_5x16__avx_broadcast,
599 xnn_f32_gemm_ukernel_1x16__avx_broadcast,
600 xnn_f32_igemm_ukernel_1x16__avx_broadcast,
601 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
602 benchmark::utils::CheckAVX);
603 }
604
Marat Dukhanfda12b82019-11-21 12:27:59 -0800605 static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
606 GEMMEnd2EndBenchmark(state, model,
607 xnn_f32_gemm_ukernel_4x8__fma3_broadcast,
608 xnn_f32_igemm_ukernel_4x8__fma3_broadcast,
609 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
610 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800611 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
612 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800613 }
614
615 static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
616 GEMMEnd2EndBenchmark(state, model,
617 xnn_f32_gemm_ukernel_5x8__fma3_broadcast,
618 xnn_f32_igemm_ukernel_5x8__fma3_broadcast,
619 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
620 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800621 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
622 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800623 }
624
625 static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
626 GEMMEnd2EndBenchmark(state, model,
627 xnn_f32_gemm_ukernel_6x8__fma3_broadcast,
628 xnn_f32_igemm_ukernel_6x8__fma3_broadcast,
629 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
630 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800631 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
632 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800633 }
634
635 static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
636 GEMMEnd2EndBenchmark(state, model,
637 xnn_f32_gemm_ukernel_7x8__fma3_broadcast,
638 xnn_f32_igemm_ukernel_7x8__fma3_broadcast,
639 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
640 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800641 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
642 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800643 }
644
645 static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
646 GEMMEnd2EndBenchmark(state, model,
647 xnn_f32_gemm_ukernel_8x8__fma3_broadcast,
648 xnn_f32_igemm_ukernel_8x8__fma3_broadcast,
649 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
650 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800651 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
652 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800653 }
654
Marat Dukhaneccfd712019-12-08 16:49:27 -0800655 static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
656 GEMMEnd2EndBenchmark(state, model,
657 xnn_f32_gemm_ukernel_3x16__fma3_broadcast,
658 xnn_f32_igemm_ukernel_3x16__fma3_broadcast,
659 xnn_f32_gemm_ukernel_1x16__fma3_broadcast,
660 xnn_f32_igemm_ukernel_1x16__fma3_broadcast,
661 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
662 benchmark::utils::CheckFMA3);
663 }
664
665 static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
666 GEMMEnd2EndBenchmark(state, model,
667 xnn_f32_gemm_ukernel_4x16__fma3_broadcast,
668 xnn_f32_igemm_ukernel_4x16__fma3_broadcast,
669 xnn_f32_gemm_ukernel_1x16__fma3_broadcast,
670 xnn_f32_igemm_ukernel_1x16__fma3_broadcast,
671 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
672 benchmark::utils::CheckFMA3);
673 }
674
675 static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
676 GEMMEnd2EndBenchmark(state, model,
677 xnn_f32_gemm_ukernel_5x16__fma3_broadcast,
678 xnn_f32_igemm_ukernel_5x16__fma3_broadcast,
679 xnn_f32_gemm_ukernel_1x16__fma3_broadcast,
680 xnn_f32_igemm_ukernel_1x16__fma3_broadcast,
681 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
682 benchmark::utils::CheckFMA3);
683 }
684
Marat Dukhan27121322019-12-09 14:57:40 -0800685 static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
686 GEMMEnd2EndBenchmark(state, model,
687 xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast,
688 xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast,
689 xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast,
690 xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast,
691 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
692 benchmark::utils::CheckFMA3);
693 }
694
695 static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
696 GEMMEnd2EndBenchmark(state, model,
697 xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast,
698 xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast,
699 xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast,
700 xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast,
701 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
702 benchmark::utils::CheckFMA3);
703 }
704
705 static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
706 GEMMEnd2EndBenchmark(state, model,
707 xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast,
708 xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast,
709 xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast,
710 xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast,
711 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
712 benchmark::utils::CheckFMA3);
713 }
714
Marat Dukhan0f349c42019-11-27 11:58:54 -0800715 static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
716 GEMMEnd2EndBenchmark(state, model,
717 xnn_f32_gemm_ukernel_4x16__avx512f_broadcast,
718 xnn_f32_igemm_ukernel_4x16__avx512f_broadcast,
719 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
720 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
721 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
722 benchmark::utils::CheckAVX512F);
723 }
724
725 static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
726 GEMMEnd2EndBenchmark(state, model,
727 xnn_f32_gemm_ukernel_5x16__avx512f_broadcast,
728 xnn_f32_igemm_ukernel_5x16__avx512f_broadcast,
729 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
730 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
731 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
732 benchmark::utils::CheckAVX512F);
733 }
734
735 static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
736 GEMMEnd2EndBenchmark(state, model,
737 xnn_f32_gemm_ukernel_6x16__avx512f_broadcast,
738 xnn_f32_igemm_ukernel_6x16__avx512f_broadcast,
739 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
740 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
741 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
742 benchmark::utils::CheckAVX512F);
743 }
744
745 static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
746 GEMMEnd2EndBenchmark(state, model,
747 xnn_f32_gemm_ukernel_7x16__avx512f_broadcast,
748 xnn_f32_igemm_ukernel_7x16__avx512f_broadcast,
749 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
750 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
751 7 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
752 benchmark::utils::CheckAVX512F);
753 }
754
755 static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
756 GEMMEnd2EndBenchmark(state, model,
757 xnn_f32_gemm_ukernel_8x16__avx512f_broadcast,
758 xnn_f32_igemm_ukernel_8x16__avx512f_broadcast,
759 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
760 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
761 8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
762 benchmark::utils::CheckAVX512F);
763 }
764
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800765 BENCHMARK_END2END(f32_gemm_4x8__sse_load1);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700766
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800767 BENCHMARK_END2END(f32_gemm_4x8__sse_dup);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700768
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800769 BENCHMARK_END2END(f32_gemm_4x8s4__sse);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800770
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800771 BENCHMARK_END2END(f32_gemm_4x8__avx_broadcast);
772 BENCHMARK_END2END(f32_gemm_5x8__avx_broadcast);
773 BENCHMARK_END2END(f32_gemm_6x8__avx_broadcast);
774 BENCHMARK_END2END(f32_gemm_7x8__avx_broadcast);
775 BENCHMARK_END2END(f32_gemm_3x16__avx_broadcast);
776 BENCHMARK_END2END(f32_gemm_4x16__avx_broadcast);
777 BENCHMARK_END2END(f32_gemm_5x16__avx_broadcast);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800778
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800779 BENCHMARK_END2END(f32_gemm_4x8__fma3_broadcast);
780 BENCHMARK_END2END(f32_gemm_5x8__fma3_broadcast);
781 BENCHMARK_END2END(f32_gemm_6x8__fma3_broadcast);
782 BENCHMARK_END2END(f32_gemm_7x8__fma3_broadcast);
783 BENCHMARK_END2END(f32_gemm_8x8__fma3_broadcast);
784 BENCHMARK_END2END(f32_gemm_3x16__fma3_broadcast);
785 BENCHMARK_END2END(f32_gemm_4x16__fma3_broadcast);
786 BENCHMARK_END2END(f32_gemm_5x16__fma3_broadcast);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800787
Marat Dukhan27121322019-12-09 14:57:40 -0800788 BENCHMARK_END2END(f32_gemm_3x16s4__fma3_broadcast);
789 BENCHMARK_END2END(f32_gemm_4x16s4__fma3_broadcast);
790 BENCHMARK_END2END(f32_gemm_5x16s4__fma3_broadcast);
791
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800792 BENCHMARK_END2END(f32_gemm_4x16__avx512f_broadcast);
793 BENCHMARK_END2END(f32_gemm_5x16__avx512f_broadcast);
794 BENCHMARK_END2END(f32_gemm_6x16__avx512f_broadcast);
795 BENCHMARK_END2END(f32_gemm_7x16__avx512f_broadcast);
796 BENCHMARK_END2END(f32_gemm_8x16__avx512f_broadcast);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700797#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
798
799#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
800 static void f32_gemm_4x8__psimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
801 GEMMEnd2EndBenchmark(state, model,
802 xnn_f32_gemm_ukernel_4x8__psimd_loadsplat,
803 xnn_f32_igemm_ukernel_4x8__psimd_loadsplat,
804 xnn_f32_gemm_ukernel_1x8__psimd_loadsplat,
805 xnn_f32_igemm_ukernel_1x8__psimd_loadsplat,
806 4 /* mr */, 8 /* nr */);
807 }
808
809 static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
810 GEMMEnd2EndBenchmark(state, model,
811 xnn_f32_gemm_ukernel_6x8__psimd_loadsplat,
812 xnn_f32_igemm_ukernel_6x8__psimd_loadsplat,
813 xnn_f32_gemm_ukernel_1x8__psimd_loadsplat,
814 xnn_f32_igemm_ukernel_1x8__psimd_loadsplat,
815 6 /* mr */, 8 /* nr */);
816 }
817
818 static void f32_gemm_4x8__psimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
819 GEMMEnd2EndBenchmark(state, model,
820 xnn_f32_gemm_ukernel_4x8__psimd_splat,
821 xnn_f32_igemm_ukernel_4x8__psimd_splat,
822 xnn_f32_gemm_ukernel_1x8__psimd_splat,
823 xnn_f32_igemm_ukernel_1x8__psimd_splat,
824 4 /* mr */, 8 /* nr */);
825 }
826
827 static void f32_gemm_6x8__psimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
828 GEMMEnd2EndBenchmark(state, model,
829 xnn_f32_gemm_ukernel_6x8__psimd_splat,
830 xnn_f32_igemm_ukernel_6x8__psimd_splat,
831 xnn_f32_gemm_ukernel_1x8__psimd_splat,
832 xnn_f32_igemm_ukernel_1x8__psimd_splat,
833 6 /* mr */, 8 /* nr */);
834 }
835
836 static void f32_gemm_4x8s4__psimd(benchmark::State& state, models::ExecutionPlanFactory model) {
837 GEMMEnd2EndBenchmark(state, model,
838 xnn_f32_gemm_ukernel_4x8s4__psimd,
839 xnn_f32_igemm_ukernel_4x8s4__psimd,
840 xnn_f32_gemm_ukernel_1x8s4__psimd,
841 xnn_f32_igemm_ukernel_1x8s4__psimd,
842 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
843 }
844
845 static void f32_gemm_6x8s4__psimd(benchmark::State& state, models::ExecutionPlanFactory model) {
846 GEMMEnd2EndBenchmark(state, model,
847 xnn_f32_gemm_ukernel_6x8s4__psimd,
848 xnn_f32_igemm_ukernel_6x8s4__psimd,
849 xnn_f32_gemm_ukernel_1x8s4__psimd,
850 xnn_f32_igemm_ukernel_1x8s4__psimd,
851 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
852 }
853
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800854 BENCHMARK_END2END(f32_gemm_4x8__psimd_loadsplat);
855 BENCHMARK_END2END(f32_gemm_6x8__psimd_loadsplat);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700856
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800857 BENCHMARK_END2END(f32_gemm_4x8__psimd_splat);
858 BENCHMARK_END2END(f32_gemm_6x8__psimd_splat);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700859
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800860 BENCHMARK_END2END(f32_gemm_4x8s4__psimd);
861 BENCHMARK_END2END(f32_gemm_6x8s4__psimd);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700862#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
863
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800864#if XNN_ARCH_WASM
865 static void f32_gemm_2x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
866 GEMMEnd2EndBenchmark(state, model,
867 xnn_f32_gemm_ukernel_2x4__wasm,
868 xnn_f32_igemm_ukernel_2x4__wasm,
869 xnn_f32_gemm_ukernel_1x4__wasm,
870 xnn_f32_igemm_ukernel_1x4__wasm,
871 2 /* mr */, 4 /* nr */);
872 }
873
874 static void f32_gemm_4x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
875 GEMMEnd2EndBenchmark(state, model,
876 xnn_f32_gemm_ukernel_4x4__wasm,
877 xnn_f32_igemm_ukernel_4x4__wasm,
878 xnn_f32_gemm_ukernel_1x4__wasm,
879 xnn_f32_igemm_ukernel_1x4__wasm,
880 4 /* mr */, 4 /* nr */);
881 }
882
883 BENCHMARK_END2END(f32_gemm_2x4__wasm);
884 BENCHMARK_END2END(f32_gemm_4x4__wasm);
885#endif // XNN_ARCH_WASM
886
Marat Dukhan5f18d262019-10-31 10:24:14 -0700887static void f32_gemm_2x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
888 GEMMEnd2EndBenchmark(state, model,
889 xnn_f32_gemm_ukernel_2x4__scalar,
890 xnn_f32_igemm_ukernel_2x4__scalar,
891 xnn_f32_gemm_ukernel_1x4__scalar,
892 xnn_f32_igemm_ukernel_1x4__scalar,
893 2 /* mr */, 4 /* nr */);
894}
895
896static void f32_gemm_4x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
897 GEMMEnd2EndBenchmark(state, model,
898 xnn_f32_gemm_ukernel_4x4__scalar,
899 xnn_f32_igemm_ukernel_4x4__scalar,
900 xnn_f32_gemm_ukernel_1x4__scalar,
901 xnn_f32_igemm_ukernel_1x4__scalar,
902 4 /* mr */, 4 /* nr */);
903}
904
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800905BENCHMARK_END2END(f32_gemm_2x4__scalar);
906BENCHMARK_END2END(f32_gemm_4x4__scalar);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700907
908#ifndef XNNPACK_BENCHMARK_NO_MAIN
909BENCHMARK_MAIN();
910#endif