// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
| 14 | |
| 15 | #include "pw_tokenizer/detokenize.h" |
| 16 | |
| 17 | #include <algorithm> |
| 18 | |
| 19 | #include "pw_tokenizer/internal/decode.h" |
| 20 | |
| 21 | namespace pw::tokenizer { |
| 22 | namespace { |
| 23 | |
| 24 | std::string UnknownTokenMessage(uint32_t value) { |
| 25 | std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token "); |
| 26 | |
| 27 | // Output a hexadecimal version of the token. |
| 28 | for (int shift = 28; shift >= 0; shift -= 4) { |
| 29 | output.push_back("0123456789abcdef"[(value >> shift) & 0xF]); |
| 30 | } |
| 31 | |
| 32 | output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX); |
| 33 | return output; |
| 34 | } |
| 35 | |
| 36 | // Decoding result with the date removed, for sorting. |
| 37 | using DecodingResult = std::pair<DecodedFormatString, uint32_t>; |
| 38 | |
| 39 | // Determines if one result is better than the other if collisions occurred. |
| 40 | // Returns true if lhs is preferred over rhs. This logic should match the |
| 41 | // collision resolution logic in detokenize.py. |
| 42 | bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) { |
| 43 | // Favor the result for which decoding succeeded. |
| 44 | if (lhs.first.ok() != rhs.first.ok()) { |
| 45 | return lhs.first.ok(); |
| 46 | } |
| 47 | |
| 48 | // Favor the result for which all bytes were decoded. |
| 49 | if ((lhs.first.remaining_bytes() == 0u) != |
| 50 | (rhs.first.remaining_bytes() == 0u)) { |
| 51 | return lhs.first.remaining_bytes() == 0u; |
| 52 | } |
| 53 | |
| 54 | // Favor the result with fewer decoding errors. |
| 55 | if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) { |
| 56 | return lhs.first.decoding_errors() < rhs.first.decoding_errors(); |
| 57 | } |
| 58 | |
| 59 | // Favor the result that successfully decoded the most arguments. |
| 60 | if (lhs.first.argument_count() != rhs.first.argument_count()) { |
| 61 | return lhs.first.argument_count() > rhs.first.argument_count(); |
| 62 | } |
| 63 | |
| 64 | // Favor the result that was removed from the database most recently. |
| 65 | return lhs.second > rhs.second; |
| 66 | } |
| 67 | |
| 68 | } // namespace |
| 69 | |
| 70 | DetokenizedString::DetokenizedString( |
| 71 | uint32_t token, |
Wyatt Hepler | e2cbadf | 2020-06-22 11:21:45 -0700 | [diff] [blame] | 72 | const std::span<const TokenizedStringEntry>& entries, |
| 73 | const std::span<const uint8_t>& arguments) |
Wyatt Hepler | 80c6ee5 | 2020-01-03 09:54:58 -0800 | [diff] [blame] | 74 | : token_(token), has_token_(true) { |
| 75 | std::vector<DecodingResult> results; |
| 76 | |
| 77 | for (const auto& [format, date_removed] : entries) { |
| 78 | results.push_back(DecodingResult{format.Format(arguments), date_removed}); |
| 79 | } |
| 80 | |
| 81 | std::sort(results.begin(), results.end(), IsBetterResult); |
| 82 | |
| 83 | for (auto& result : results) { |
| 84 | matches_.push_back(std::move(result.first)); |
| 85 | } |
| 86 | } |
| 87 | |
| 88 | std::string DetokenizedString::BestString() const { |
| 89 | return matches_.empty() ? std::string() : matches_[0].value(); |
| 90 | } |
| 91 | |
| 92 | std::string DetokenizedString::BestStringWithErrors() const { |
| 93 | if (matches_.empty()) { |
| 94 | return has_token_ ? UnknownTokenMessage(token_) |
| 95 | : PW_TOKENIZER_ARG_DECODING_ERROR("missing token"); |
| 96 | } |
| 97 | return matches_[0].value_with_errors(); |
| 98 | } |
| 99 | |
| 100 | Detokenizer::Detokenizer(const TokenDatabase& database) { |
| 101 | for (const auto& entry : database) { |
| 102 | database_[entry.token].emplace_back(entry.string, entry.date_removed); |
| 103 | } |
| 104 | } |
| 105 | |
| 106 | DetokenizedString Detokenizer::Detokenize( |
Wyatt Hepler | e2cbadf | 2020-06-22 11:21:45 -0700 | [diff] [blame] | 107 | const std::span<const uint8_t>& encoded) const { |
Wyatt Hepler | 80c6ee5 | 2020-01-03 09:54:58 -0800 | [diff] [blame] | 108 | // The token is missing from the encoded data; there is nothing to do. |
| 109 | if (encoded.size() < sizeof(uint32_t)) { |
| 110 | return DetokenizedString(); |
| 111 | } |
| 112 | |
| 113 | const uint32_t token = |
| 114 | encoded[3] << 24 | encoded[2] << 16 | encoded[1] << 8 | encoded[0]; |
| 115 | |
| 116 | const auto result = database_.find(token); |
| 117 | |
| 118 | return DetokenizedString(token, |
| 119 | result == database_.end() |
Wyatt Hepler | e2cbadf | 2020-06-22 11:21:45 -0700 | [diff] [blame] | 120 | ? std::span<TokenizedStringEntry>() |
| 121 | : std::span(result->second), |
Wyatt Hepler | 80c6ee5 | 2020-01-03 09:54:58 -0800 | [diff] [blame] | 122 | encoded.subspan(sizeof(token))); |
| 123 | } |
| 124 | |
| 125 | } // namespace pw::tokenizer |