blob: 2523fda3cf8b4737bc58f342b16573bcfa341d09 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cmath>
8#include <functional>
9#include <random>
10#include <vector>
11
12#include <xnnpack.h>
13
14#include <benchmark/benchmark.h>
15
Marat Dukhanc08cdf52019-12-09 09:17:51 -080016#include "bench/end2end.h"
Frank Barcharde72e2872019-10-31 11:12:15 -070017#include "bench/utils.h"
Marat Dukhan5f18d262019-10-31 10:24:14 -070018#include "models/models.h"
19#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/params.h>
22
23
24static void GEMMEnd2EndBenchmark(
25 benchmark::State& state,
Frank Barcharde72e2872019-10-31 11:12:15 -070026 models::ExecutionPlanFactory model_factory,
Marat Dukhan5f18d262019-10-31 10:24:14 -070027 xnn_f32_gemm_ukernel_function gemm,
28 xnn_f32_igemm_ukernel_function igemm,
29 xnn_f32_gemm_ukernel_function gemm1,
30 xnn_f32_igemm_ukernel_function igemm1,
Marat Dukhanc8466f52019-11-25 18:01:10 -080031 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
32 benchmark::utils::IsaCheckFunction isa_check = nullptr)
Marat Dukhan5f18d262019-10-31 10:24:14 -070033{
Marat Dukhanc8466f52019-11-25 18:01:10 -080034 if (isa_check && !isa_check(state)) {
35 return;
36 }
Marat Dukhan04f03be2019-11-19 12:36:47 -080037 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
Marat Dukhan5f18d262019-10-31 10:24:14 -070038 state.SkipWithError("failed to initialize XNNPACK");
39 return;
40 }
41
42 // Override microkernels chosen in xnn_initialize
43 xnn_params.f32.gemm = (struct gemm_parameters) {
44 .gemm = xnn_gemm_ukernel_function(gemm),
45 .igemm = xnn_igemm_ukernel_function(igemm),
46 .gemm1 = xnn_gemm_ukernel_function(gemm1),
47 .igemm1 = xnn_igemm_ukernel_function(igemm1),
48 .mr = mr,
49 .nr = nr,
50 .log2_kr = log2_kr,
51 .log2_sr = log2_sr,
52 };
53
54 auto execution_plan = model_factory(nullptr);
55 if (execution_plan.empty()) {
56 state.SkipWithError("failed to create a model");
57 return;
58 }
59
60 for (auto _ : state) {
61 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
62 xnn_status status = xnn_run_operator(op.get(), nullptr);
63 if (status != xnn_status_success) {
64 state.SkipWithError("failed to run a model");
65 return;
66 }
67 }
68 }
Frank Barcharde72e2872019-10-31 11:12:15 -070069 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
Marat Dukhan5f18d262019-10-31 10:24:14 -070070}
71
72#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
73 static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
74 GEMMEnd2EndBenchmark(state, model,
75 xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
76 xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
77 xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
78 xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
79 4 /* mr */, 12 /* nr */);
80 }
81
82 static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
83 GEMMEnd2EndBenchmark(state, model,
84 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53,
85 xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53,
86 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
87 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
88 4 /* mr */, 8 /* nr */);
89 }
90
91 static void f32_gemm_4x8__aarch64_neonfma_cortex_a57(benchmark::State& state, models::ExecutionPlanFactory model) {
92 GEMMEnd2EndBenchmark(state, model,
93 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
Frank Barchard387c2d12019-12-16 19:14:07 -080094 xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
Marat Dukhan5f18d262019-10-31 10:24:14 -070095 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
96 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
97 4 /* mr */, 8 /* nr */);
98 }
99
100 static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
101 GEMMEnd2EndBenchmark(state, model,
102 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
103 xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
104 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
105 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
106 4 /* mr */, 8 /* nr */);
107 }
108
109 static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
110 GEMMEnd2EndBenchmark(state, model,
111 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64,
Frank Barchard91317c52019-11-22 10:54:35 -0800112 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64,
113 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
114 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700115 4 /* mr */, 8 /* nr */);
116 }
117
118 static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
119 GEMMEnd2EndBenchmark(state, model,
120 xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128,
Frank Barchard91317c52019-11-22 10:54:35 -0800121 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128,
122 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
123 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700124 4 /* mr */, 8 /* nr */);
125 }
126
Frank Barchard387c2d12019-12-16 19:14:07 -0800127 static void f32_gemm_5x8__aarch64_neonfma_cortex_a57(benchmark::State& state, models::ExecutionPlanFactory model) {
128 GEMMEnd2EndBenchmark(state, model,
129 xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57,
130 xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57,
131 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
132 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
133 5 /* mr */, 8 /* nr */);
134 }
135
Marat Dukhan5f18d262019-10-31 10:24:14 -0700136 static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
137 GEMMEnd2EndBenchmark(state, model,
138 xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75,
139 xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75,
140 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
141 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
142 5 /* mr */, 8 /* nr */);
143 }
144
145 static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
146 GEMMEnd2EndBenchmark(state, model,
147 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
148 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
149 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
150 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
151 6 /* mr */, 8 /* nr */);
152 }
153
Frank Barchard91e19992020-03-09 18:46:14 -0700154 static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
155 GEMMEnd2EndBenchmark(state, model,
156 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55,
157 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55,
158 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
159 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
160 6 /* mr */, 8 /* nr */);
161 }
162
Marat Dukhan5f18d262019-10-31 10:24:14 -0700163 static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, models::ExecutionPlanFactory model) {
164 GEMMEnd2EndBenchmark(state, model,
165 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
166 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
167 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
168 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
169 6 /* mr */, 8 /* nr */);
170 }
171
Frank Barchard387c2d12019-12-16 19:14:07 -0800172 static void f32_gemm_6x8__aarch64_neonfma_cortex_a57(benchmark::State& state, models::ExecutionPlanFactory model) {
173 GEMMEnd2EndBenchmark(state, model,
174 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
175 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
176 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
177 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
178 6 /* mr */, 8 /* nr */);
179 }
180
Marat Dukhan5f18d262019-10-31 10:24:14 -0700181 static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
182 GEMMEnd2EndBenchmark(state, model,
183 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
184 xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
185 xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
186 xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
187 6 /* mr */, 8 /* nr */);
188 }
189
190 static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
191 GEMMEnd2EndBenchmark(state, model,
192 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64,
Frank Barchard91317c52019-11-22 10:54:35 -0800193 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
194 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
195 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700196 6 /* mr */, 8 /* nr */);
197 }
198
199 static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
200 GEMMEnd2EndBenchmark(state, model,
201 xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128,
Frank Barchard91317c52019-11-22 10:54:35 -0800202 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
203 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
204 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
205 6 /* mr */, 8 /* nr */);
206 }
207
208 static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
209 GEMMEnd2EndBenchmark(state, model,
210 xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64,
211 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64,
212 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
213 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
214 4 /* mr */, 8 /* nr */);
215 }
216
217 static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
218 GEMMEnd2EndBenchmark(state, model,
219 xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128,
220 xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128,
221 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
222 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
223 4 /* mr */, 8 /* nr */);
224 }
225
226 static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
227 GEMMEnd2EndBenchmark(state, model,
228 xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
229 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
230 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
231 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
Marat Dukhan5f18d262019-10-31 10:24:14 -0700232 6 /* mr */, 8 /* nr */);
233 }
234
Frank Barchard69172d92019-11-26 16:22:39 -0800235 static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
236 GEMMEnd2EndBenchmark(state, model,
237 xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128,
238 xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128,
239 xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
240 xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
241 6 /* mr */, 8 /* nr */);
242 }
243
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800244 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_ld64)
245 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_ld128);
246 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_ld64);
247 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_ld128);
248 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
249 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a57)
250 BENCHMARK_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
Frank Barchard387c2d12019-12-16 19:14:07 -0800251 BENCHMARK_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a57);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800252 BENCHMARK_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a75);
253 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a53);
Frank Barchard91e19992020-03-09 18:46:14 -0700254 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a55);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800255 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a73);
Frank Barchard387c2d12019-12-16 19:14:07 -0800256 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a57);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800257 BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a75);
258 BENCHMARK_END2END(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
Marat Dukhan5f18d262019-10-31 10:24:14 -0700259
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800260 BENCHMARK_END2END(f32_gemm_4x8__neonfma_lane_ld64);
261 BENCHMARK_END2END(f32_gemm_4x8__neonfma_lane_ld128);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700262
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800263 BENCHMARK_END2END(f32_gemm_6x8__neonfma_lane_ld64);
264 BENCHMARK_END2END(f32_gemm_6x8__neonfma_lane_ld128);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700265#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
266
Frank Barchardcab94932019-12-03 10:48:54 -0800267#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
268 static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
269 GEMMEnd2EndBenchmark(state, model,
270 xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64,
Frank Barcharddc38f072020-02-10 13:21:42 -0800271 xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64,
Frank Barchardcab94932019-12-03 10:48:54 -0800272 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
273 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
274 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
275 benchmark::utils::CheckNEON);
276 }
Frank Barchard13916042019-12-11 10:56:34 -0800277 static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
278 GEMMEnd2EndBenchmark(state, model,
279 xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53,
Frank Barchardc87a8fd2020-02-12 13:02:52 -0800280 xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53,
Frank Barchard13916042019-12-11 10:56:34 -0800281 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
282 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
283 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
284 benchmark::utils::CheckNEON);
285 }
Frank Barchard3e237f22019-12-04 23:08:51 -0800286 static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
287 GEMMEnd2EndBenchmark(state, model,
288 xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
Frank Barchard90ce7892020-02-10 23:35:45 -0800289 xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75,
Frank Barchard3e237f22019-12-04 23:08:51 -0800290 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
291 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
292 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
293 benchmark::utils::CheckNEON);
294 }
Frank Barchard9f7d5552019-12-12 10:58:10 -0800295 static void f32_gemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
296 GEMMEnd2EndBenchmark(state, model,
297 xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75,
Frank Barchard90ce7892020-02-10 23:35:45 -0800298 xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75,
Frank Barchard9f7d5552019-12-12 10:58:10 -0800299 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
300 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
301 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
302 benchmark::utils::CheckNEON);
303 }
Frank Barchardcab94932019-12-03 10:48:54 -0800304
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800305 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_ld64);
Frank Barchard13916042019-12-11 10:56:34 -0800306 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800307 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
Frank Barchard9f7d5552019-12-12 10:58:10 -0800308 BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_pld_cortex_a75);
Frank Barchardcab94932019-12-03 10:48:54 -0800309#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
310
311
Marat Dukhan5f18d262019-10-31 10:24:14 -0700312#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -0800313 static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
Marat Dukhan5f18d262019-10-31 10:24:14 -0700314 GEMMEnd2EndBenchmark(state, model,
Frank Barchard91317c52019-11-22 10:54:35 -0800315 xnn_f32_gemm_ukernel_4x8__neon_lane_ld64,
316 xnn_f32_igemm_ukernel_4x8__neon_lane_ld64,
317 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
318 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800319 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
320 benchmark::utils::CheckNEON);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700321 }
322
Frank Barchard91317c52019-11-22 10:54:35 -0800323 static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
Marat Dukhan5f18d262019-10-31 10:24:14 -0700324 GEMMEnd2EndBenchmark(state, model,
Frank Barchard91317c52019-11-22 10:54:35 -0800325 xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
326 xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
327 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
328 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800329 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
330 benchmark::utils::CheckNEON);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700331 }
332
Frank Barchard91317c52019-11-22 10:54:35 -0800333 static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
Marat Dukhan5f18d262019-10-31 10:24:14 -0700334 GEMMEnd2EndBenchmark(state, model,
Frank Barchard91317c52019-11-22 10:54:35 -0800335 xnn_f32_gemm_ukernel_6x8__neon_lane_ld64,
336 xnn_f32_igemm_ukernel_6x8__neon_lane_ld64,
337 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
338 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800339 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
340 benchmark::utils::CheckNEON);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700341 }
Frank Barchard69172d92019-11-26 16:22:39 -0800342
343 static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
344 GEMMEnd2EndBenchmark(state, model,
345 xnn_f32_gemm_ukernel_6x8__neon_lane_ld128,
346 xnn_f32_igemm_ukernel_6x8__neon_lane_ld128,
347 xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
348 xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
349 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
350 benchmark::utils::CheckNEON);
351 }
352
Frank Barchard5243bb02019-11-22 16:37:50 -0800353 static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
354 GEMMEnd2EndBenchmark(state, model,
355 xnn_f32_gemm_ukernel_4x8__neon_dup_ld64,
356 xnn_f32_igemm_ukernel_4x8__neon_dup_ld64,
357 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
358 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800359 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
360 benchmark::utils::CheckNEON);
Frank Barchard5243bb02019-11-22 16:37:50 -0800361 }
362
363 static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
364 GEMMEnd2EndBenchmark(state, model,
365 xnn_f32_gemm_ukernel_4x8__neon_dup_ld128,
366 xnn_f32_igemm_ukernel_4x8__neon_dup_ld128,
367 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
368 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800369 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
370 benchmark::utils::CheckNEON);
Frank Barchard5243bb02019-11-22 16:37:50 -0800371 }
372
373 static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
374 GEMMEnd2EndBenchmark(state, model,
375 xnn_f32_gemm_ukernel_6x8__neon_dup_ld64,
376 xnn_f32_igemm_ukernel_6x8__neon_dup_ld64,
377 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
378 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800379 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
380 benchmark::utils::CheckNEON);
Frank Barchard5243bb02019-11-22 16:37:50 -0800381 }
382
Frank Barchard69172d92019-11-26 16:22:39 -0800383 static void f32_gemm_6x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
384 GEMMEnd2EndBenchmark(state, model,
385 xnn_f32_gemm_ukernel_6x8__neon_dup_ld128,
386 xnn_f32_igemm_ukernel_6x8__neon_dup_ld128,
387 xnn_f32_gemm_ukernel_1x8__neon_dup_ld64,
388 xnn_f32_igemm_ukernel_1x8__neon_dup_ld64,
389 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
390 benchmark::utils::CheckNEON);
391 }
392
Frank Barchard5243bb02019-11-22 16:37:50 -0800393 static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
394 GEMMEnd2EndBenchmark(state, model,
395 xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64,
396 xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64,
397 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
398 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800399 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
400 benchmark::utils::CheckNEONFMA);
Frank Barchard5243bb02019-11-22 16:37:50 -0800401 }
402
403 static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
404 GEMMEnd2EndBenchmark(state, model,
405 xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128,
406 xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128,
407 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
408 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800409 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
410 benchmark::utils::CheckNEONFMA);
Frank Barchard5243bb02019-11-22 16:37:50 -0800411 }
412
413 static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
414 GEMMEnd2EndBenchmark(state, model,
415 xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64,
416 xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64,
417 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
418 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800419 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
420 benchmark::utils::CheckNEONFMA);
Frank Barchard5243bb02019-11-22 16:37:50 -0800421 }
Marat Dukhan5f18d262019-10-31 10:24:14 -0700422
Frank Barchard69172d92019-11-26 16:22:39 -0800423 static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
424 GEMMEnd2EndBenchmark(state, model,
425 xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128,
426 xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128,
427 xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64,
428 xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64,
429 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
430 benchmark::utils::CheckNEONFMA);
431 }
432
Frank Barcharddf06d802019-11-20 15:53:46 -0800433 static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
434 GEMMEnd2EndBenchmark(state, model,
435 xnn_f32_gemm_ukernel_4x8s4__neon,
436 xnn_f32_igemm_ukernel_4x8s4__neon,
437 xnn_f32_gemm_ukernel_1x8s4__neon,
438 xnn_f32_igemm_ukernel_1x8s4__neon,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800439 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
440 benchmark::utils::CheckNEON);
Frank Barcharddf06d802019-11-20 15:53:46 -0800441 }
442
443 static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
444 GEMMEnd2EndBenchmark(state, model,
445 xnn_f32_gemm_ukernel_4x8s4__neonfma,
446 xnn_f32_igemm_ukernel_4x8s4__neonfma,
447 xnn_f32_gemm_ukernel_1x8s4__neonfma,
448 xnn_f32_igemm_ukernel_1x8s4__neonfma,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800449 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
450 benchmark::utils::CheckNEONFMA);
Frank Barcharddf06d802019-11-20 15:53:46 -0800451 }
452
453 static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
454 GEMMEnd2EndBenchmark(state, model,
455 xnn_f32_gemm_ukernel_6x8s4__neon,
456 xnn_f32_igemm_ukernel_6x8s4__neon,
457 xnn_f32_gemm_ukernel_1x8s4__neon,
458 xnn_f32_igemm_ukernel_1x8s4__neon,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800459 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
460 benchmark::utils::CheckNEON);
Frank Barcharddf06d802019-11-20 15:53:46 -0800461 }
462
463 static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
464 GEMMEnd2EndBenchmark(state, model,
465 xnn_f32_gemm_ukernel_6x8s4__neonfma,
466 xnn_f32_igemm_ukernel_6x8s4__neonfma,
467 xnn_f32_gemm_ukernel_1x8s4__neonfma,
468 xnn_f32_igemm_ukernel_1x8s4__neonfma,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800469 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
470 benchmark::utils::CheckNEONFMA);
Frank Barcharddf06d802019-11-20 15:53:46 -0800471 }
472
473 static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
474 GEMMEnd2EndBenchmark(state, model,
475 xnn_f32_gemm_ukernel_8x8s4__neon,
476 xnn_f32_igemm_ukernel_8x8s4__neon,
477 xnn_f32_gemm_ukernel_1x8s4__neon,
478 xnn_f32_igemm_ukernel_1x8s4__neon,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800479 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
480 benchmark::utils::CheckNEON);
Frank Barcharddf06d802019-11-20 15:53:46 -0800481 }
482
483 static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
484 GEMMEnd2EndBenchmark(state, model,
485 xnn_f32_gemm_ukernel_8x8s4__neonfma,
486 xnn_f32_igemm_ukernel_8x8s4__neonfma,
487 xnn_f32_gemm_ukernel_1x8s4__neonfma,
488 xnn_f32_igemm_ukernel_1x8s4__neonfma,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800489 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
490 benchmark::utils::CheckNEONFMA);
Frank Barcharddf06d802019-11-20 15:53:46 -0800491 }
492
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800493 BENCHMARK_END2END(f32_gemm_4x8__neon_lane_ld64);
494 BENCHMARK_END2END(f32_gemm_4x8__neon_lane_ld128);
495 BENCHMARK_END2END(f32_gemm_6x8__neon_lane_ld64);
496 BENCHMARK_END2END(f32_gemm_6x8__neon_lane_ld128);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700497
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800498 BENCHMARK_END2END(f32_gemm_4x8__neon_dup_ld64);
499 BENCHMARK_END2END(f32_gemm_4x8__neon_dup_ld128);
500 BENCHMARK_END2END(f32_gemm_6x8__neon_dup_ld64);
501 BENCHMARK_END2END(f32_gemm_6x8__neon_dup_ld128);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700502
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800503 BENCHMARK_END2END(f32_gemm_4x8__neonfma_dup_ld64);
504 BENCHMARK_END2END(f32_gemm_4x8__neonfma_dup_ld128);
505 BENCHMARK_END2END(f32_gemm_6x8__neonfma_dup_ld64);
506 BENCHMARK_END2END(f32_gemm_6x8__neonfma_dup_ld128);
Frank Barcharddf06d802019-11-20 15:53:46 -0800507
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800508 BENCHMARK_END2END(f32_gemm_4x8s4__neon);
509 BENCHMARK_END2END(f32_gemm_6x8s4__neon);
510 BENCHMARK_END2END(f32_gemm_8x8s4__neon);
Frank Barchard69172d92019-11-26 16:22:39 -0800511
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800512 BENCHMARK_END2END(f32_gemm_4x8s4__neonfma);
513 BENCHMARK_END2END(f32_gemm_6x8s4__neonfma);
514 BENCHMARK_END2END(f32_gemm_8x8s4__neonfma);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700515#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
516
Marat Dukhan5f18d262019-10-31 10:24:14 -0700517#if XNN_ARCH_X86 || XNN_ARCH_X86_64
518 static void f32_gemm_4x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
519 GEMMEnd2EndBenchmark(state, model,
520 xnn_f32_gemm_ukernel_4x8__sse_load1,
521 xnn_f32_igemm_ukernel_4x8__sse_load1,
522 xnn_f32_gemm_ukernel_1x8__sse_load1,
523 xnn_f32_igemm_ukernel_1x8__sse_load1,
524 4 /* mr */, 8 /* nr */);
525 }
526
527 static void f32_gemm_4x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
528 GEMMEnd2EndBenchmark(state, model,
529 xnn_f32_gemm_ukernel_4x8__sse_dup,
530 xnn_f32_igemm_ukernel_4x8__sse_dup,
531 xnn_f32_gemm_ukernel_1x8__sse_dup,
532 xnn_f32_igemm_ukernel_1x8__sse_dup,
533 4 /* mr */, 8 /* nr */);
534 }
535
536 static void f32_gemm_4x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
537 GEMMEnd2EndBenchmark(state, model,
538 xnn_f32_gemm_ukernel_4x8s4__sse,
539 xnn_f32_igemm_ukernel_4x8s4__sse,
540 xnn_f32_gemm_ukernel_1x8s4__sse,
541 xnn_f32_igemm_ukernel_1x8s4__sse,
542 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
543 }
544
Marat Dukhanfda12b82019-11-21 12:27:59 -0800545 static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
546 GEMMEnd2EndBenchmark(state, model,
547 xnn_f32_gemm_ukernel_4x8__avx_broadcast,
548 xnn_f32_igemm_ukernel_4x8__avx_broadcast,
549 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
550 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800551 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
552 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800553 }
554
555 static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
556 GEMMEnd2EndBenchmark(state, model,
557 xnn_f32_gemm_ukernel_5x8__avx_broadcast,
558 xnn_f32_igemm_ukernel_5x8__avx_broadcast,
559 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
560 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800561 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
562 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800563 }
564
565 static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
566 GEMMEnd2EndBenchmark(state, model,
567 xnn_f32_gemm_ukernel_6x8__avx_broadcast,
568 xnn_f32_igemm_ukernel_6x8__avx_broadcast,
569 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
570 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800571 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
572 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800573 }
574
575 static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
576 GEMMEnd2EndBenchmark(state, model,
577 xnn_f32_gemm_ukernel_7x8__avx_broadcast,
578 xnn_f32_igemm_ukernel_7x8__avx_broadcast,
579 xnn_f32_gemm_ukernel_1x8__avx_broadcast,
580 xnn_f32_igemm_ukernel_1x8__avx_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800581 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
582 benchmark::utils::CheckAVX);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800583 }
584
Marat Dukhaneccfd712019-12-08 16:49:27 -0800585 static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
586 GEMMEnd2EndBenchmark(state, model,
587 xnn_f32_gemm_ukernel_3x16__avx_broadcast,
588 xnn_f32_igemm_ukernel_3x16__avx_broadcast,
589 xnn_f32_gemm_ukernel_1x16__avx_broadcast,
590 xnn_f32_igemm_ukernel_1x16__avx_broadcast,
591 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
592 benchmark::utils::CheckAVX);
593 }
594
595 static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
596 GEMMEnd2EndBenchmark(state, model,
597 xnn_f32_gemm_ukernel_4x16__avx_broadcast,
598 xnn_f32_igemm_ukernel_4x16__avx_broadcast,
599 xnn_f32_gemm_ukernel_1x16__avx_broadcast,
600 xnn_f32_igemm_ukernel_1x16__avx_broadcast,
601 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
602 benchmark::utils::CheckAVX);
603 }
604
605 static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
606 GEMMEnd2EndBenchmark(state, model,
607 xnn_f32_gemm_ukernel_5x16__avx_broadcast,
608 xnn_f32_igemm_ukernel_5x16__avx_broadcast,
609 xnn_f32_gemm_ukernel_1x16__avx_broadcast,
610 xnn_f32_igemm_ukernel_1x16__avx_broadcast,
611 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
612 benchmark::utils::CheckAVX);
613 }
614
Marat Dukhanfda12b82019-11-21 12:27:59 -0800615 static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
616 GEMMEnd2EndBenchmark(state, model,
617 xnn_f32_gemm_ukernel_4x8__fma3_broadcast,
618 xnn_f32_igemm_ukernel_4x8__fma3_broadcast,
619 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
620 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800621 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
622 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800623 }
624
625 static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
626 GEMMEnd2EndBenchmark(state, model,
627 xnn_f32_gemm_ukernel_5x8__fma3_broadcast,
628 xnn_f32_igemm_ukernel_5x8__fma3_broadcast,
629 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
630 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800631 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
632 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800633 }
634
635 static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
636 GEMMEnd2EndBenchmark(state, model,
637 xnn_f32_gemm_ukernel_6x8__fma3_broadcast,
638 xnn_f32_igemm_ukernel_6x8__fma3_broadcast,
639 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
640 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800641 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
642 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800643 }
644
645 static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
646 GEMMEnd2EndBenchmark(state, model,
647 xnn_f32_gemm_ukernel_7x8__fma3_broadcast,
648 xnn_f32_igemm_ukernel_7x8__fma3_broadcast,
649 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
650 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800651 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
652 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800653 }
654
655 static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
656 GEMMEnd2EndBenchmark(state, model,
657 xnn_f32_gemm_ukernel_8x8__fma3_broadcast,
658 xnn_f32_igemm_ukernel_8x8__fma3_broadcast,
659 xnn_f32_gemm_ukernel_1x8__fma3_broadcast,
660 xnn_f32_igemm_ukernel_1x8__fma3_broadcast,
Marat Dukhanc8466f52019-11-25 18:01:10 -0800661 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
662 benchmark::utils::CheckFMA3);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800663 }
664
Marat Dukhaneccfd712019-12-08 16:49:27 -0800665 static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
666 GEMMEnd2EndBenchmark(state, model,
667 xnn_f32_gemm_ukernel_3x16__fma3_broadcast,
668 xnn_f32_igemm_ukernel_3x16__fma3_broadcast,
669 xnn_f32_gemm_ukernel_1x16__fma3_broadcast,
670 xnn_f32_igemm_ukernel_1x16__fma3_broadcast,
671 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
672 benchmark::utils::CheckFMA3);
673 }
674
675 static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
676 GEMMEnd2EndBenchmark(state, model,
677 xnn_f32_gemm_ukernel_4x16__fma3_broadcast,
678 xnn_f32_igemm_ukernel_4x16__fma3_broadcast,
679 xnn_f32_gemm_ukernel_1x16__fma3_broadcast,
680 xnn_f32_igemm_ukernel_1x16__fma3_broadcast,
681 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
682 benchmark::utils::CheckFMA3);
683 }
684
685 static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
686 GEMMEnd2EndBenchmark(state, model,
687 xnn_f32_gemm_ukernel_5x16__fma3_broadcast,
688 xnn_f32_igemm_ukernel_5x16__fma3_broadcast,
689 xnn_f32_gemm_ukernel_1x16__fma3_broadcast,
690 xnn_f32_igemm_ukernel_1x16__fma3_broadcast,
691 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
692 benchmark::utils::CheckFMA3);
693 }
694
Marat Dukhan27121322019-12-09 14:57:40 -0800695 static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
696 GEMMEnd2EndBenchmark(state, model,
697 xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast,
698 xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast,
699 xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast,
700 xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast,
701 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
702 benchmark::utils::CheckFMA3);
703 }
704
705 static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
706 GEMMEnd2EndBenchmark(state, model,
707 xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast,
708 xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast,
709 xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast,
710 xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast,
711 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
712 benchmark::utils::CheckFMA3);
713 }
714
715 static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
716 GEMMEnd2EndBenchmark(state, model,
717 xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast,
718 xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast,
719 xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast,
720 xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast,
721 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
722 benchmark::utils::CheckFMA3);
723 }
724
Marat Dukhan0f349c42019-11-27 11:58:54 -0800725 static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
726 GEMMEnd2EndBenchmark(state, model,
727 xnn_f32_gemm_ukernel_4x16__avx512f_broadcast,
728 xnn_f32_igemm_ukernel_4x16__avx512f_broadcast,
729 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
730 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
731 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
732 benchmark::utils::CheckAVX512F);
733 }
734
735 static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
736 GEMMEnd2EndBenchmark(state, model,
737 xnn_f32_gemm_ukernel_5x16__avx512f_broadcast,
738 xnn_f32_igemm_ukernel_5x16__avx512f_broadcast,
739 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
740 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
741 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
742 benchmark::utils::CheckAVX512F);
743 }
744
745 static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
746 GEMMEnd2EndBenchmark(state, model,
747 xnn_f32_gemm_ukernel_6x16__avx512f_broadcast,
748 xnn_f32_igemm_ukernel_6x16__avx512f_broadcast,
749 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
750 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
751 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
752 benchmark::utils::CheckAVX512F);
753 }
754
755 static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
756 GEMMEnd2EndBenchmark(state, model,
757 xnn_f32_gemm_ukernel_7x16__avx512f_broadcast,
758 xnn_f32_igemm_ukernel_7x16__avx512f_broadcast,
759 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
760 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
761 7 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
762 benchmark::utils::CheckAVX512F);
763 }
764
765 static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
766 GEMMEnd2EndBenchmark(state, model,
767 xnn_f32_gemm_ukernel_8x16__avx512f_broadcast,
768 xnn_f32_igemm_ukernel_8x16__avx512f_broadcast,
769 xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
770 xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
771 8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
772 benchmark::utils::CheckAVX512F);
773 }
774
// Register the SSE load1 wrapper via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800775 BENCHMARK_END2END(f32_gemm_4x8__sse_load1);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700776
// Register the SSE dup wrapper via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800777 BENCHMARK_END2END(f32_gemm_4x8__sse_dup);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700778
// Register the SSE s4 wrapper via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800779 BENCHMARK_END2END(f32_gemm_4x8s4__sse);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800780
// Register the AVX broadcast wrappers (Mx8 first, then Mx16) via BENCHMARK_END2END
// (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800781 BENCHMARK_END2END(f32_gemm_4x8__avx_broadcast);
782 BENCHMARK_END2END(f32_gemm_5x8__avx_broadcast);
783 BENCHMARK_END2END(f32_gemm_6x8__avx_broadcast);
784 BENCHMARK_END2END(f32_gemm_7x8__avx_broadcast);
785 BENCHMARK_END2END(f32_gemm_3x16__avx_broadcast);
786 BENCHMARK_END2END(f32_gemm_4x16__avx_broadcast);
787 BENCHMARK_END2END(f32_gemm_5x16__avx_broadcast);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800788
// Register the FMA3 broadcast wrappers (Mx8 first, then Mx16) via BENCHMARK_END2END
// (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800789 BENCHMARK_END2END(f32_gemm_4x8__fma3_broadcast);
790 BENCHMARK_END2END(f32_gemm_5x8__fma3_broadcast);
791 BENCHMARK_END2END(f32_gemm_6x8__fma3_broadcast);
792 BENCHMARK_END2END(f32_gemm_7x8__fma3_broadcast);
793 BENCHMARK_END2END(f32_gemm_8x8__fma3_broadcast);
794 BENCHMARK_END2END(f32_gemm_3x16__fma3_broadcast);
795 BENCHMARK_END2END(f32_gemm_4x16__fma3_broadcast);
796 BENCHMARK_END2END(f32_gemm_5x16__fma3_broadcast);
Marat Dukhanfda12b82019-11-21 12:27:59 -0800797
// Register the FMA3 Mx16s4 broadcast wrappers via BENCHMARK_END2END
// (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhan27121322019-12-09 14:57:40 -0800798 BENCHMARK_END2END(f32_gemm_3x16s4__fma3_broadcast);
799 BENCHMARK_END2END(f32_gemm_4x16s4__fma3_broadcast);
800 BENCHMARK_END2END(f32_gemm_5x16s4__fma3_broadcast);
801
// Register the AVX512F Mx16 broadcast wrappers via BENCHMARK_END2END
// (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800802 BENCHMARK_END2END(f32_gemm_4x16__avx512f_broadcast);
803 BENCHMARK_END2END(f32_gemm_5x16__avx512f_broadcast);
804 BENCHMARK_END2END(f32_gemm_6x16__avx512f_broadcast);
805 BENCHMARK_END2END(f32_gemm_7x16__avx512f_broadcast);
806 BENCHMARK_END2END(f32_gemm_8x16__avx512f_broadcast);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700807#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
808
809#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
810 static void f32_gemm_4x8__psimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
811 GEMMEnd2EndBenchmark(state, model,
812 xnn_f32_gemm_ukernel_4x8__psimd_loadsplat,
813 xnn_f32_igemm_ukernel_4x8__psimd_loadsplat,
814 xnn_f32_gemm_ukernel_1x8__psimd_loadsplat,
815 xnn_f32_igemm_ukernel_1x8__psimd_loadsplat,
816 4 /* mr */, 8 /* nr */);
817 }
818
819 static void f32_gemm_6x8__psimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
820 GEMMEnd2EndBenchmark(state, model,
821 xnn_f32_gemm_ukernel_6x8__psimd_loadsplat,
822 xnn_f32_igemm_ukernel_6x8__psimd_loadsplat,
823 xnn_f32_gemm_ukernel_1x8__psimd_loadsplat,
824 xnn_f32_igemm_ukernel_1x8__psimd_loadsplat,
825 6 /* mr */, 8 /* nr */);
826 }
827
828 static void f32_gemm_4x8__psimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
829 GEMMEnd2EndBenchmark(state, model,
830 xnn_f32_gemm_ukernel_4x8__psimd_splat,
831 xnn_f32_igemm_ukernel_4x8__psimd_splat,
832 xnn_f32_gemm_ukernel_1x8__psimd_splat,
833 xnn_f32_igemm_ukernel_1x8__psimd_splat,
834 4 /* mr */, 8 /* nr */);
835 }
836
837 static void f32_gemm_6x8__psimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
838 GEMMEnd2EndBenchmark(state, model,
839 xnn_f32_gemm_ukernel_6x8__psimd_splat,
840 xnn_f32_igemm_ukernel_6x8__psimd_splat,
841 xnn_f32_gemm_ukernel_1x8__psimd_splat,
842 xnn_f32_igemm_ukernel_1x8__psimd_splat,
843 6 /* mr */, 8 /* nr */);
844 }
845
846 static void f32_gemm_4x8s4__psimd(benchmark::State& state, models::ExecutionPlanFactory model) {
847 GEMMEnd2EndBenchmark(state, model,
848 xnn_f32_gemm_ukernel_4x8s4__psimd,
849 xnn_f32_igemm_ukernel_4x8s4__psimd,
850 xnn_f32_gemm_ukernel_1x8s4__psimd,
851 xnn_f32_igemm_ukernel_1x8s4__psimd,
852 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
853 }
854
855 static void f32_gemm_6x8s4__psimd(benchmark::State& state, models::ExecutionPlanFactory model) {
856 GEMMEnd2EndBenchmark(state, model,
857 xnn_f32_gemm_ukernel_6x8s4__psimd,
858 xnn_f32_igemm_ukernel_6x8s4__psimd,
859 xnn_f32_gemm_ukernel_1x8s4__psimd,
860 xnn_f32_igemm_ukernel_1x8s4__psimd,
861 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
862 }
863
// Register the PSIMD loadsplat wrappers via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800864 BENCHMARK_END2END(f32_gemm_4x8__psimd_loadsplat);
865 BENCHMARK_END2END(f32_gemm_6x8__psimd_loadsplat);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700866
// Register the PSIMD splat wrappers via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800867 BENCHMARK_END2END(f32_gemm_4x8__psimd_splat);
868 BENCHMARK_END2END(f32_gemm_6x8__psimd_splat);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700869
// Register the PSIMD s4 wrappers via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800870 BENCHMARK_END2END(f32_gemm_4x8s4__psimd);
871 BENCHMARK_END2END(f32_gemm_6x8s4__psimd);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700872#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
873
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800874#if XNN_ARCH_WASM
875 static void f32_gemm_2x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
876 GEMMEnd2EndBenchmark(state, model,
877 xnn_f32_gemm_ukernel_2x4__wasm,
878 xnn_f32_igemm_ukernel_2x4__wasm,
879 xnn_f32_gemm_ukernel_1x4__wasm,
880 xnn_f32_igemm_ukernel_1x4__wasm,
881 2 /* mr */, 4 /* nr */);
882 }
883
884 static void f32_gemm_4x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
885 GEMMEnd2EndBenchmark(state, model,
886 xnn_f32_gemm_ukernel_4x4__wasm,
887 xnn_f32_igemm_ukernel_4x4__wasm,
888 xnn_f32_gemm_ukernel_1x4__wasm,
889 xnn_f32_igemm_ukernel_1x4__wasm,
890 4 /* mr */, 4 /* nr */);
891 }
892
// Register the WASM wrappers via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
893 BENCHMARK_END2END(f32_gemm_2x4__wasm);
894 BENCHMARK_END2END(f32_gemm_4x4__wasm);
895#endif // XNN_ARCH_WASM
896
Marat Dukhan5f18d262019-10-31 10:24:14 -0700897static void f32_gemm_2x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
898 GEMMEnd2EndBenchmark(state, model,
899 xnn_f32_gemm_ukernel_2x4__scalar,
900 xnn_f32_igemm_ukernel_2x4__scalar,
901 xnn_f32_gemm_ukernel_1x4__scalar,
902 xnn_f32_igemm_ukernel_1x4__scalar,
903 2 /* mr */, 4 /* nr */);
904}
905
906static void f32_gemm_4x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
907 GEMMEnd2EndBenchmark(state, model,
908 xnn_f32_gemm_ukernel_4x4__scalar,
909 xnn_f32_igemm_ukernel_4x4__scalar,
910 xnn_f32_gemm_ukernel_1x4__scalar,
911 xnn_f32_igemm_ukernel_1x4__scalar,
912 4 /* mr */, 4 /* nr */);
913}
914
// Register the scalar wrappers via BENCHMARK_END2END (macro defined outside this view; presumably bench/end2end.h).
Marat Dukhanc08cdf52019-12-09 09:17:51 -0800915BENCHMARK_END2END(f32_gemm_2x4__scalar);
916BENCHMARK_END2END(f32_gemm_4x4__scalar);
Marat Dukhan5f18d262019-10-31 10:24:14 -0700917
// Expand BENCHMARK_MAIN() (Google Benchmark's main()-defining macro) only when the
// build has not defined XNNPACK_BENCHMARK_NO_MAIN.
918#ifndef XNNPACK_BENCHMARK_NO_MAIN
919BENCHMARK_MAIN();
920#endif