Primiano Tucci | df3ab20 | 2020-05-21 14:20:57 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2020 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | // See /docs/design-docs/protozero.md for rationale and results. |
| 18 | |
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

#include <memory>
#include <new>
#include <vector>

#include <benchmark/benchmark.h>

#include "perfetto/base/compiler.h"
#include "perfetto/protozero/static_buffer.h"

// Autogenerated headers in out/*/gen/
#include "src/protozero/test/example_proto/library.pbzero.h"
#include "src/protozero/test/example_proto/test_messages.pb.h"
#include "src/protozero/test/example_proto/test_messages.pbzero.h"
| 33 | |
// Short alias for the classes generated by the protozero plugin (declared in
// the *.pbzero.h headers included above).
namespace pbzero = protozero::test::protos::pbzero;

// Short alias for the classes generated by the official protobuf compiler
// (declared in the *.pb.h header included above).
namespace pblite = protozero::test::protos;
| 39 | |
| 40 | namespace { |
| 41 | |
// Size of the slot handed to each benchmark iteration. It must be strictly
// larger than the largest amount of data any single iteration writes.
constexpr size_t kBufPerIteration = 512;

// Total size of the output area (64 MB). Iterations walk over it cyclically,
// one slot at a time, to simulate a realistic tracing scenario.
constexpr size_t kTotalWorkingSetSize = size_t{64} << 20;
alignas(uint64_t) char g_out_buffer[kTotalWorkingSetSize];

// Start of the slot the current iteration writes into.
char* g_cur = &g_out_buffer[0];
| 51 | |
// Fake payload fed to every serializer. Words [0..3] provide the values of the
// four integer fields. Words [4..7] are reinterpreted as a C string by
// FillMessage_Simple(): on little-endian targets that is 31 bytes of 0x66
// followed by the zero top byte of the last word, which acts as the NUL
// terminator.
uint64_t g_fake_input_simple[] = {0x12345678,
                                  0x90ABCDEF,
                                  0x11111111,
                                  0xFFFFFFFF,
                                  0x6666666666666666ULL,
                                  0x6666666666666666ULL,
                                  0x6666666666666666ULL,
                                  0x0066666666666666ULL};

// Speed-of-light serializer. A very simple C++ class that just appends data
// into a linear buffer making all sorts of favourable assumptions. It does not
// use any binary-stable encoding, it does not perform bound checking, it
// assumes favourably aligned writes and it doesn't deal with any
// thread-safety.
// The speed-of-light serializer serves as a reference for how fast a serializer
// could be if argument marshalling and bound checking were zero cost.
struct SOLMsg {
  // Copies the object representation of |x| at the write cursor and advances
  // the cursor by sizeof(T). No bound checking is performed.
  template <typename T>
  void Append(T x) {
    // The reinterpret_cast is to give favorable alignment guarantees.
    memcpy(reinterpret_cast<T*>(ptr_), &x, sizeof(x));
    ptr_ += sizeof(x);
  }

  // Setters mirroring the subset of the generated message API that the
  // FillMessage_*() helpers use.
  void set_field_int32(int32_t x) { Append(x); }
  void set_field_uint32(uint32_t x) { Append(x); }
  void set_field_int64(int64_t x) { Append(x); }
  void set_field_uint64(uint64_t x) { Append(x); }

  // Copies |str|, including its NUL terminator, at the write cursor and
  // advances the cursor past the terminator. No bound checking is performed.
  void set_field_string(const char* str) {
    // strcpy() returns its destination argument, so assigning its result to
    // |ptr_| would leave the cursor unchanged and a later write would overwrite
    // the string. Advance explicitly by the number of bytes copied instead.
    const size_t size_with_nul = strlen(str) + 1;
    memcpy(ptr_, str, size_with_nul);
    ptr_ += size_with_nul;
  }

  // Constructs a child message in the memory immediately following this
  // object. The caller must guarantee that sizeof(SOLMsg) writable bytes are
  // available there.
  SOLMsg* add_field_nested() { return new (this + 1) SOLMsg(); }

  // Inline payload storage, large enough to hold the whole fake input.
  char storage_[sizeof(g_fake_input_simple)];

  // Write cursor: the next byte of |storage_| to be written.
  char* ptr_ = &storage_[0];
};
| 86 | |
| 87 | template <typename T> |
| 88 | PERFETTO_ALWAYS_INLINE void FillMessage_Simple(T* msg) { |
| 89 | benchmark::DoNotOptimize(g_fake_input_simple); |
| 90 | msg->set_field_int32(static_cast<int32_t>(g_fake_input_simple[0])); |
| 91 | msg->set_field_uint32(static_cast<uint32_t>(g_fake_input_simple[1])); |
| 92 | msg->set_field_int64(static_cast<int64_t>(g_fake_input_simple[2])); |
| 93 | msg->set_field_uint64(static_cast<uint64_t>(g_fake_input_simple[3])); |
| 94 | msg->set_field_string(reinterpret_cast<const char*>(&g_fake_input_simple[4])); |
| 95 | } |
| 96 | |
| 97 | template <typename T> |
| 98 | PERFETTO_ALWAYS_INLINE void FillMessage_Nested(T* msg, int depth = 0) { |
| 99 | benchmark::DoNotOptimize(g_fake_input_simple); |
| 100 | FillMessage_Simple(msg); |
| 101 | if (depth < 3) { |
| 102 | auto* child = msg->add_field_nested(); |
| 103 | FillMessage_Nested(child, depth + 1); |
| 104 | } |
| 105 | } |
| 106 | |
| 107 | PERFETTO_ALWAYS_INLINE void Clobber(benchmark::State& state) { |
| 108 | uint64_t* buf = reinterpret_cast<uint64_t*>(g_cur); |
| 109 | |
| 110 | // Read-back the data written to have a realistic evaluation of the |
| 111 | // speed-of-light scenario. This is to deal with architecture of modern CPUs. |
| 112 | // If we write a bunch of memory bytes, never read-back from them, and then |
| 113 | // just over-write them, the CPU can just throw away the whole stream of |
| 114 | // instructions that produced them, if that's still in flight and tracked in |
| 115 | // the out-of-order units. |
| 116 | // The buf[i-1] ^= buf forces the CPU to consume the result of the writes. |
| 117 | buf[0] = reinterpret_cast<uint64_t>(&state); |
| 118 | for (size_t i = 1; i < kBufPerIteration / sizeof(uint64_t); i++) |
| 119 | buf[i] ^= buf[i - 1]; |
Lalit Maganti | 15b3c02 | 2020-06-05 13:05:53 +0100 | [diff] [blame] | 120 | if (buf[(kBufPerIteration / sizeof(uint64_t)) - 1] == 42) |
Primiano Tucci | df3ab20 | 2020-05-21 14:20:57 +0100 | [diff] [blame] | 121 | PERFETTO_CHECK(false); |
| 122 | benchmark::DoNotOptimize(buf); |
| 123 | |
| 124 | constexpr size_t kWrap = kTotalWorkingSetSize / kBufPerIteration; |
| 125 | g_cur = &g_out_buffer[(state.iterations() % kWrap) * kBufPerIteration]; |
| 126 | benchmark::ClobberMemory(); |
| 127 | } |
| 128 | |
| 129 | } // namespace |
| 130 | |
| 131 | static void BM_Protozero_Simple_Libprotobuf(benchmark::State& state) { |
| 132 | while (state.KeepRunning()) { |
| 133 | { |
| 134 | // The nested block is to account for RAII finalizers. |
| 135 | pblite::EveryField msg; |
| 136 | FillMessage_Simple(&msg); |
| 137 | msg.SerializeToArray(g_cur, kBufPerIteration); |
| 138 | } |
| 139 | Clobber(state); |
| 140 | } |
| 141 | } |
| 142 | |
| 143 | static void BM_Protozero_Simple_Protozero(benchmark::State& state) { |
| 144 | while (state.KeepRunning()) { |
| 145 | { |
| 146 | protozero::StaticBuffered<pbzero::EveryField> msg(g_cur, |
| 147 | kBufPerIteration); |
| 148 | FillMessage_Simple(msg.get()); |
| 149 | } |
| 150 | Clobber(state); |
| 151 | } |
| 152 | } |
| 153 | |
| 154 | static void BM_Protozero_Simple_SpeedOfLight(benchmark::State& state) { |
| 155 | while (state.KeepRunning()) { |
| 156 | SOLMsg* msg = new (g_cur) SOLMsg(); |
| 157 | FillMessage_Simple(msg); |
| 158 | Clobber(state); |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | static void BM_Protozero_Nested_Libprotobuf(benchmark::State& state) { |
| 163 | while (state.KeepRunning()) { |
| 164 | { |
| 165 | pblite::EveryField msg; |
| 166 | FillMessage_Nested(&msg); |
| 167 | msg.SerializeToArray(g_cur, kBufPerIteration); |
| 168 | } |
| 169 | Clobber(state); |
| 170 | } |
| 171 | } |
| 172 | |
| 173 | static void BM_Protozero_Nested_Protozero(benchmark::State& state) { |
| 174 | while (state.KeepRunning()) { |
| 175 | { |
| 176 | protozero::StaticBuffered<pbzero::EveryField> msg(g_cur, |
| 177 | kBufPerIteration); |
| 178 | FillMessage_Nested(msg.get()); |
| 179 | } |
| 180 | Clobber(state); |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | static void BM_Protozero_Nested_SpeedOfLight(benchmark::State& state) { |
| 185 | while (state.KeepRunning()) { |
| 186 | SOLMsg* msg = new (g_cur) SOLMsg(); |
| 187 | FillMessage_Nested(msg); |
| 188 | Clobber(state); |
| 189 | } |
| 190 | } |
| 191 | |
// Flat message (four integer fields and one string field), one benchmark per
// serializer.
BENCHMARK(BM_Protozero_Simple_Libprotobuf);
BENCHMARK(BM_Protozero_Simple_Protozero);
BENCHMARK(BM_Protozero_Simple_SpeedOfLight);

// Same payload repeated across a chain of nested child messages, one benchmark
// per serializer.
BENCHMARK(BM_Protozero_Nested_Libprotobuf);
BENCHMARK(BM_Protozero_Nested_Protozero);
BENCHMARK(BM_Protozero_Nested_SpeedOfLight);