blob: e2cfcb88fd5f3b5aa8e6cc8bd246405cca9ace5e [file] [log] [blame]
/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040016#include "tune.h"
17
18#include <algorithm>
19#include <cstdint>
20
21#include "opt_set.h"
22#include "time.h"
23
24namespace ruy {
25
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040026#ifdef __aarch64__
Benoit Jacobf9f30f92019-04-09 12:10:14 -040027
28namespace {
29
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040030void PoorlyOrderedKernel(int iters) {
31 asm volatile(
32 "mov w0, %w[iters]\n"
33 "1:\n"
34 "subs w0, w0, #1\n"
35 "mul v0.4s, v0.4s, v0.4s\n"
36 "mul v0.4s, v0.4s, v0.4s\n"
37 "mul v0.4s, v0.4s, v0.4s\n"
38 "mul v0.4s, v0.4s, v0.4s\n"
39 "mul v1.4s, v1.4s, v1.4s\n"
40 "mul v1.4s, v1.4s, v1.4s\n"
41 "mul v1.4s, v1.4s, v1.4s\n"
42 "mul v1.4s, v1.4s, v1.4s\n"
43 "mul v2.4s, v2.4s, v2.4s\n"
44 "mul v2.4s, v2.4s, v2.4s\n"
45 "mul v2.4s, v2.4s, v2.4s\n"
46 "mul v2.4s, v2.4s, v2.4s\n"
47 "mul v3.4s, v3.4s, v3.4s\n"
48 "mul v3.4s, v3.4s, v3.4s\n"
49 "mul v3.4s, v3.4s, v3.4s\n"
50 "mul v3.4s, v3.4s, v3.4s\n"
51 "bne 1b\n" ::[iters] "r"(iters)
52 : "cc", "x0", "v0", "v1", "v2", "v3");
53}
54
55void NicelyOrderedKernel(int iters) {
56 asm volatile(
57 "mov w0, %w[iters]\n"
58 "1:\n"
59 "subs w0, w0, #1\n"
60 "mul v0.4s, v0.4s, v0.4s\n"
61 "mul v1.4s, v1.4s, v1.4s\n"
62 "mul v2.4s, v2.4s, v2.4s\n"
63 "mul v3.4s, v3.4s, v3.4s\n"
64 "mul v0.4s, v0.4s, v0.4s\n"
65 "mul v1.4s, v1.4s, v1.4s\n"
66 "mul v2.4s, v2.4s, v2.4s\n"
67 "mul v3.4s, v3.4s, v3.4s\n"
68 "mul v0.4s, v0.4s, v0.4s\n"
69 "mul v1.4s, v1.4s, v1.4s\n"
70 "mul v2.4s, v2.4s, v2.4s\n"
71 "mul v3.4s, v3.4s, v3.4s\n"
72 "mul v0.4s, v0.4s, v0.4s\n"
73 "mul v1.4s, v1.4s, v1.4s\n"
74 "mul v2.4s, v2.4s, v2.4s\n"
75 "mul v3.4s, v3.4s, v3.4s\n"
76 "bne 1b\n" ::[iters] "r"(iters)
77 : "cc", "x0", "v0", "v1", "v2", "v3");
78}
79
Benoit Jacobf9f30f92019-04-09 12:10:14 -040080} // namespace
81
82float TuningResolver::EvalRatio() {
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040083 // With the current settings, 400 iterations and 4 repeats, this test has
84 // a latency of roughly 80 microseconds on a Cortex-A53 at 1.4 GHz.
85 static constexpr int kLoopIters = 400;
86 static constexpr int kRepeats = 4;
87
Benoit Jacob842bfaf2019-04-11 10:43:28 -040088 Duration timing_poorly_ordered = Duration::max();
89 Duration timing_nicely_ordered = Duration::max();
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040090
91 for (int r = 0; r < kRepeats; r++) {
Benoit Jacob842bfaf2019-04-11 10:43:28 -040092 TimePoint t0 = Clock::now();
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040093 PoorlyOrderedKernel(kLoopIters);
Benoit Jacob842bfaf2019-04-11 10:43:28 -040094 TimePoint t1 = Clock::now();
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040095 NicelyOrderedKernel(kLoopIters);
Benoit Jacob842bfaf2019-04-11 10:43:28 -040096 TimePoint t2 = Clock::now();
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -040097 timing_poorly_ordered = std::min(timing_poorly_ordered, t1 - t0);
98 timing_nicely_ordered = std::min(timing_nicely_ordered, t2 - t1);
99 }
100
Benoit Jacob842bfaf2019-04-11 10:43:28 -0400101 return ToSeconds(timing_nicely_ordered) / ToSeconds(timing_poorly_ordered);
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400102}
103
Benoit Jacobf9f30f92019-04-09 12:10:14 -0400104float TuningResolver::ThresholdRatio() {
105 // Empirically (see :tune_tool) determined threshold to distinguish in-order
106 // Cortex-A53/A55 cores from out-of-order Cortex-A57/A73/A75/A76 cores. Based
107 // on these experimental results, which were obtained with much lower
108 // (kLoopIters=1000, kRepeats=1) so as to make them resilient to noise, we
109 // have:
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400110 //
111 // CPU core type | in/out of order | observed ratio
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400112 // --------------+-----------------+-----------------------------------------
113 // Cortex-A53 | in-order | 0.32 -- 0.329
114 // Cortex-A55 | in-order | 0.319 -- 0.325
115 // Cortex-A55r1 | in-order | 0.319 -- 0.325
116 // Cortex-A57 | out-of-order | 0.99 -- 1.01
117 // Cortex-A73 | out-of-order | 0.922 -- 0.927
118 // Cortex-A75 | out-of-order | 0.921 -- 0.93
119 // Cortex-A76 | out-of-order | 1
120 // Kryo (pixel1) | out-of-order | 0.73 -- 0.76
121 //
122 // Thus the allowable range for the threshold is [0.35 .. 0.70].
123 // We pick a value closer to the upper bound because really any out-of-order
124 // CPU should by definition produce a ratio close to 1.
125 return 0.65f;
126}
127
Benoit Jacobf9f30f92019-04-09 12:10:14 -0400128Tuning TuningResolver::ResolveNow() {
129 const bool is_probably_inorder = EvalRatio() < ThresholdRatio();
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400130 return is_probably_inorder ? Tuning::kInOrder : Tuning::kOutOfOrder;
131}
132
133#else // not defined __aarch64__
134
Benoit Jacobf9f30f92019-04-09 12:10:14 -0400135float TuningResolver::EvalRatio() { return 0; }
136float TuningResolver::ThresholdRatio() { return 0; }
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400137
Benoit Jacobf9f30f92019-04-09 12:10:14 -0400138Tuning TuningResolver::ResolveNow() { return Tuning::kOutOfOrder; }
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400139
140#endif
141
Benoit Jacob842bfaf2019-04-11 10:43:28 -0400142static constexpr double kExpirySecs = 0.25;
143
144TuningResolver::TuningResolver()
145 : expiry_duration_(DurationFromSeconds(kExpirySecs)) {}
146
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400147Tuning TuningResolver::Resolve() {
148#if (defined RUY_OPT_SET) && !(RUY_OPT_SET & RUY_OPT_TUNING)
149 return Tuning::kOutOfOrder;
150#endif
151 if (unresolved_tuning_ != Tuning::kAuto) {
152 return unresolved_tuning_;
153 }
Benoit Jacob842bfaf2019-04-11 10:43:28 -0400154 TimePoint new_timepoint = Clock::now();
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400155 if (last_resolved_tuning_ != Tuning::kAuto &&
Benoit Jacob842bfaf2019-04-11 10:43:28 -0400156 (new_timepoint - last_resolved_timepoint_) < expiry_duration_) {
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400157 return last_resolved_tuning_;
158 }
Benoit Jacob842bfaf2019-04-11 10:43:28 -0400159 last_resolved_timepoint_ = new_timepoint;
Benoit Jacobf9f30f92019-04-09 12:10:14 -0400160 last_resolved_tuning_ = ResolveNow();
Benoit Jacoba0ba3ac2019-04-08 12:00:37 -0400161 return last_resolved_tuning_;
162}
163
164} // namespace ruy