Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2015 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #ifndef ART_CMDLINE_TOKEN_RANGE_H_ |
| 18 | #define ART_CMDLINE_TOKEN_RANGE_H_ |
| 19 | |
| 20 | #include <assert.h> |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 21 | #include <algorithm> |
| 22 | #include <memory> |
Andreas Gampe | 8cf9cb3 | 2017-07-19 09:28:38 -0700 | [diff] [blame] | 23 | #include <string> |
| 24 | #include <vector> |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 25 | |
Andreas Gampe | 9186ced | 2016-12-12 14:28:21 -0800 | [diff] [blame] | 26 | #include "android-base/strings.h" |
| 27 | |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 28 | namespace art { |
| 29 | // A range of tokens to make token matching algorithms easier. |
| 30 | // |
| 31 | // We try really hard to avoid copying and store only a pointer and iterators to the |
| 32 | // interiors of the vector, so a typical copy constructor never ends up doing a deep copy. |
| 33 | // It is up to the user to play nice and not to mutate the strings in-place. |
| 34 | // |
| 35 | // Tokens are only copied if a mutating operation is performed (and even then only |
| 36 | // if it *actually* mutates the token). |
| 37 | struct TokenRange { |
| 38 | // Short-hand for a vector of strings. A single string and a token is synonymous. |
| 39 | using TokenList = std::vector<std::string>; |
| 40 | |
| 41 | // Copying-from-vector constructor. |
| 42 | explicit TokenRange(const TokenList& token_list) |
| 43 | : token_list_(new TokenList(token_list)), |
| 44 | begin_(token_list_->begin()), |
| 45 | end_(token_list_->end()) |
| 46 | {} |
| 47 | |
| 48 | // Copying-from-iterator constructor |
| 49 | template <typename ForwardIterator> |
Roland Levillain | 3887c46 | 2015-08-12 18:15:42 +0100 | [diff] [blame] | 50 | TokenRange(ForwardIterator it_begin, ForwardIterator it_end) |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 51 | : token_list_(new TokenList(it_begin, it_end)), |
| 52 | begin_(token_list_->begin()), |
| 53 | end_(token_list_->end()) |
| 54 | {} |
| 55 | |
| 56 | #if 0 |
| 57 | // Copying-from-vector constructor. |
| 58 | TokenRange(const TokenList& token_list ATTRIBUTE_UNUSED, |
| 59 | TokenList::const_iterator it_begin, |
| 60 | TokenList::const_iterator it_end) |
| 61 | : token_list_(new TokenList(it_begin, it_end)), |
| 62 | begin_(token_list_->begin()), |
| 63 | end_(token_list_->end()) { |
| 64 | assert(it_begin >= token_list.begin()); |
| 65 | assert(it_end <= token_list.end()); |
| 66 | } |
| 67 | #endif |
| 68 | |
| 69 | // Copying from char array constructor, convertings into tokens (strings) along the way. |
| 70 | TokenRange(const char* token_list[], size_t length) |
| 71 | : token_list_(new TokenList(&token_list[0], &token_list[length])), |
| 72 | begin_(token_list_->begin()), |
| 73 | end_(token_list_->end()) |
| 74 | {} |
| 75 | |
| 76 | // Non-copying move-from-vector constructor. Takes over the token vector. |
| 77 | explicit TokenRange(TokenList&& token_list) |
| 78 | : token_list_(new TokenList(std::forward<TokenList>(token_list))), |
| 79 | begin_(token_list_->begin()), |
| 80 | end_(token_list_->end()) |
| 81 | {} |
| 82 | |
| 83 | // Non-copying constructor. Retain reference to existing list of tokens. |
| 84 | TokenRange(std::shared_ptr<TokenList> token_list, |
| 85 | TokenList::const_iterator it_begin, |
| 86 | TokenList::const_iterator it_end) |
| 87 | : token_list_(token_list), |
| 88 | begin_(it_begin), |
| 89 | end_(it_end) { |
| 90 | assert(it_begin >= token_list->begin()); |
| 91 | assert(it_end <= token_list->end()); |
| 92 | } |
| 93 | |
| 94 | // Non-copying copy constructor. |
Andreas Gampe | c801f0d | 2015-02-24 20:55:16 -0800 | [diff] [blame] | 95 | TokenRange(const TokenRange&) = default; |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 96 | |
| 97 | // Non-copying move constructor. |
Andreas Gampe | c801f0d | 2015-02-24 20:55:16 -0800 | [diff] [blame] | 98 | TokenRange(TokenRange&&) = default; |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 99 | |
| 100 | // Non-copying constructor. Retains reference to an existing list of tokens, with offset. |
| 101 | explicit TokenRange(std::shared_ptr<TokenList> token_list) |
| 102 | : token_list_(token_list), |
| 103 | begin_(token_list_->begin()), |
| 104 | end_(token_list_->end()) |
| 105 | {} |
| 106 | |
| 107 | // Iterator type for begin() and end(). Guaranteed to be a RandomAccessIterator. |
| 108 | using iterator = TokenList::const_iterator; |
| 109 | |
| 110 | // Iterator type for const begin() and const end(). Guaranteed to be a RandomAccessIterator. |
| 111 | using const_iterator = iterator; |
| 112 | |
| 113 | // Create a token range by splitting a string. Each separator gets their own token. |
| 114 | // Since the separator are retained as tokens, it might be useful to call |
| 115 | // RemoveToken afterwards. |
| 116 | static TokenRange Split(const std::string& string, std::initializer_list<char> separators) { |
| 117 | TokenList new_token_list; |
| 118 | |
| 119 | std::string tok; |
| 120 | for (auto&& c : string) { |
| 121 | for (char sep : separators) { |
| 122 | if (c == sep) { |
| 123 | // We spotted a separator character. |
| 124 | // Push back everything before the last separator as a new token. |
| 125 | // Push back the separator as a token. |
| 126 | if (!tok.empty()) { |
| 127 | new_token_list.push_back(tok); |
| 128 | tok = ""; |
| 129 | } |
| 130 | new_token_list.push_back(std::string() + sep); |
| 131 | } else { |
| 132 | // Build up the token with another character. |
| 133 | tok += c; |
| 134 | } |
| 135 | } |
| 136 | } |
| 137 | |
| 138 | if (!tok.empty()) { |
| 139 | new_token_list.push_back(tok); |
| 140 | } |
| 141 | |
| 142 | return TokenRange(std::move(new_token_list)); |
| 143 | } |
| 144 | |
| 145 | // A RandomAccessIterator to the first element in this range. |
| 146 | iterator begin() const { |
| 147 | return begin_; |
| 148 | } |
| 149 | |
| 150 | // A RandomAccessIterator to one past the last element in this range. |
| 151 | iterator end() const { |
| 152 | return end_; |
| 153 | } |
| 154 | |
| 155 | // The size of the range, i.e. how many tokens are in it. |
| 156 | size_t Size() const { |
| 157 | return std::distance(begin_, end_); |
| 158 | } |
| 159 | |
| 160 | // Are there 0 tokens in this range? |
| 161 | bool IsEmpty() const { |
| 162 | return Size() > 0; |
| 163 | } |
| 164 | |
| 165 | // Look up a token by it's offset. |
| 166 | const std::string& GetToken(size_t offset) const { |
| 167 | assert(offset < Size()); |
| 168 | return *(begin_ + offset); |
| 169 | } |
| 170 | |
| 171 | // Does this token range equal the other range? |
| 172 | // Equality is defined as having both the same size, and |
| 173 | // each corresponding token being equal. |
| 174 | bool operator==(const TokenRange& other) const { |
| 175 | if (this == &other) { |
| 176 | return true; |
| 177 | } |
| 178 | |
| 179 | if (Size() != other.Size()) { |
| 180 | return false; |
| 181 | } |
| 182 | |
| 183 | return std::equal(begin(), end(), other.begin()); |
| 184 | } |
| 185 | |
| 186 | // Look up the token at the requested index. |
| 187 | const std::string& operator[](int index) const { |
| 188 | assert(index >= 0 && static_cast<size_t>(index) < Size()); |
| 189 | return *(begin() + index); |
| 190 | } |
| 191 | |
| 192 | // Does this current range start with the other range? |
| 193 | bool StartsWith(const TokenRange& other) const { |
| 194 | if (this == &other) { |
| 195 | return true; |
| 196 | } |
| 197 | |
| 198 | if (Size() < other.Size()) { |
| 199 | return false; |
| 200 | } |
| 201 | |
| 202 | auto& smaller = Size() < other.Size() ? *this : other; |
| 203 | auto& greater = Size() < other.Size() ? other : *this; |
| 204 | |
| 205 | return std::equal(smaller.begin(), smaller.end(), greater.begin()); |
| 206 | } |
| 207 | |
| 208 | // Remove all characters 'c' from each token, potentially copying the underlying tokens. |
| 209 | TokenRange RemoveCharacter(char c) const { |
| 210 | TokenList new_token_list(begin(), end()); |
| 211 | |
| 212 | bool changed = false; |
| 213 | for (auto&& token : new_token_list) { |
| 214 | auto it = std::remove_if(token.begin(), token.end(), [&](char ch) { |
| 215 | if (ch == c) { |
| 216 | changed = true; |
| 217 | return true; |
| 218 | } |
| 219 | return false; |
| 220 | }); |
| 221 | token.erase(it, token.end()); |
| 222 | } |
| 223 | |
| 224 | if (!changed) { |
| 225 | return *this; |
| 226 | } |
| 227 | |
| 228 | return TokenRange(std::move(new_token_list)); |
| 229 | } |
| 230 | |
| 231 | // Remove all tokens matching this one, potentially copying the underlying tokens. |
| 232 | TokenRange RemoveToken(const std::string& token) { |
| 233 | return RemoveIf([&](const std::string& tok) { return tok == token; }); |
| 234 | } |
| 235 | |
| 236 | // Discard all empty tokens, potentially copying the underlying tokens. |
| 237 | TokenRange DiscardEmpty() const { |
| 238 | return RemoveIf([](const std::string& token) { return token.empty(); }); |
| 239 | } |
| 240 | |
| 241 | // Create a non-copying subset of this range. |
| 242 | // Length is trimmed so that the Slice does not go out of range. |
| 243 | TokenRange Slice(size_t offset, size_t length = std::string::npos) const { |
| 244 | assert(offset < Size()); |
| 245 | |
| 246 | if (length != std::string::npos && offset + length > Size()) { |
| 247 | length = Size() - offset; |
| 248 | } |
| 249 | |
| 250 | iterator it_end; |
| 251 | if (length == std::string::npos) { |
| 252 | it_end = end(); |
| 253 | } else { |
| 254 | it_end = begin() + offset + length; |
| 255 | } |
| 256 | |
| 257 | return TokenRange(token_list_, begin() + offset, it_end); |
| 258 | } |
| 259 | |
| 260 | // Try to match the string with tokens from this range. |
| 261 | // Each token is used to match exactly once (after which the next token is used, and so on). |
| 262 | // The matching happens from left-to-right in a non-greedy fashion. |
| 263 | // If the currently-matched token is the wildcard, then the new outputted token will |
| 264 | // contain as much as possible until the next token is matched. |
| 265 | // |
| 266 | // For example, if this == ["a:", "_", "b:] and "_" is the match string, then |
| 267 | // MatchSubstrings on "a:foob:" will yield: ["a:", "foo", "b:"] |
| 268 | // |
| 269 | // Since the string matching can fail (e.g. ["foo"] against "bar"), then this |
| 270 | // function can fail, in which cause it will return null. |
| 271 | std::unique_ptr<TokenRange> MatchSubstrings(const std::string& string, |
| 272 | const std::string& wildcard) const { |
| 273 | TokenList new_token_list; |
| 274 | |
| 275 | size_t wildcard_idx = std::string::npos; |
| 276 | size_t string_idx = 0; |
| 277 | |
| 278 | // Function to push all the characters matched as a wildcard so far |
| 279 | // as a brand new token. It resets the wildcard matching. |
| 280 | // Empty wildcards are possible and ok, but only if wildcard matching was on. |
| 281 | auto maybe_push_wildcard_token = [&]() { |
| 282 | if (wildcard_idx != std::string::npos) { |
| 283 | size_t wildcard_length = string_idx - wildcard_idx; |
| 284 | std::string wildcard_substr = string.substr(wildcard_idx, wildcard_length); |
| 285 | new_token_list.push_back(std::move(wildcard_substr)); |
| 286 | |
| 287 | wildcard_idx = std::string::npos; |
| 288 | } |
| 289 | }; |
| 290 | |
| 291 | for (iterator it = begin(); it != end(); ++it) { |
| 292 | const std::string& tok = *it; |
| 293 | |
| 294 | if (tok == wildcard) { |
| 295 | maybe_push_wildcard_token(); |
| 296 | wildcard_idx = string_idx; |
| 297 | continue; |
| 298 | } |
| 299 | |
| 300 | size_t next_token_idx = string.find(tok); |
| 301 | if (next_token_idx == std::string::npos) { |
| 302 | // Could not find token at all |
| 303 | return nullptr; |
| 304 | } else if (next_token_idx != string_idx && wildcard_idx == std::string::npos) { |
| 305 | // Found the token at a non-starting location, and we weren't |
| 306 | // trying to parse the wildcard. |
| 307 | return nullptr; |
| 308 | } |
| 309 | |
| 310 | new_token_list.push_back(string.substr(next_token_idx, tok.size())); |
| 311 | maybe_push_wildcard_token(); |
| 312 | string_idx += tok.size(); |
| 313 | } |
| 314 | |
| 315 | size_t remaining = string.size() - string_idx; |
| 316 | if (remaining > 0) { |
| 317 | if (wildcard_idx == std::string::npos) { |
| 318 | // Some characters were still remaining in the string, |
| 319 | // but it wasn't trying to match a wildcard. |
| 320 | return nullptr; |
| 321 | } |
| 322 | } |
| 323 | |
| 324 | // If some characters are remaining, the rest must be a wildcard. |
| 325 | string_idx += remaining; |
| 326 | maybe_push_wildcard_token(); |
| 327 | |
Yi Kong | c57c680 | 2018-10-29 14:28:56 -0700 | [diff] [blame^] | 328 | return std::make_unique<TokenRange>(std::move(new_token_list)); |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 329 | } |
| 330 | |
| 331 | // Do a quick match token-by-token, and see if they match. |
| 332 | // Any tokens with a wildcard in them are only matched up until the wildcard. |
| 333 | // If this is true, then the wildcard matching later on can still fail, so this is not |
| 334 | // a guarantee that the argument is correct, it's more of a strong hint that the |
| 335 | // user-provided input *probably* was trying to match this argument. |
| 336 | // |
| 337 | // Returns how many tokens were either matched (or ignored because there was a |
| 338 | // wildcard present). 0 means no match. If the size() tokens are returned. |
| 339 | size_t MaybeMatches(const TokenRange& token_list, const std::string& wildcard) const { |
| 340 | auto token_it = token_list.begin(); |
| 341 | auto token_end = token_list.end(); |
| 342 | auto name_it = begin(); |
| 343 | auto name_end = end(); |
| 344 | |
| 345 | size_t matched_tokens = 0; |
| 346 | |
| 347 | while (token_it != token_end && name_it != name_end) { |
| 348 | // Skip token matching when the corresponding name has a wildcard in it. |
| 349 | const std::string& name = *name_it; |
| 350 | |
| 351 | size_t wildcard_idx = name.find(wildcard); |
| 352 | if (wildcard_idx == std::string::npos) { // No wildcard present |
| 353 | // Did the definition token match the user token? |
| 354 | if (name != *token_it) { |
| 355 | return matched_tokens; |
| 356 | } |
| 357 | } else { |
| 358 | std::string name_prefix = name.substr(0, wildcard_idx); |
| 359 | |
| 360 | // Did the user token start with the up-to-the-wildcard prefix? |
| 361 | if (!StartsWith(*token_it, name_prefix)) { |
| 362 | return matched_tokens; |
| 363 | } |
| 364 | } |
| 365 | |
| 366 | ++token_it; |
| 367 | ++name_it; |
| 368 | ++matched_tokens; |
| 369 | } |
| 370 | |
| 371 | // If we got this far, it's either a full match or the token list was too short. |
| 372 | return matched_tokens; |
| 373 | } |
| 374 | |
| 375 | // Flatten the token range by joining every adjacent token with the separator character. |
| 376 | // e.g. ["hello", "world"].join('$') == "hello$world" |
| 377 | std::string Join(char separator) const { |
| 378 | TokenList tmp(begin(), end()); |
Andreas Gampe | 9186ced | 2016-12-12 14:28:21 -0800 | [diff] [blame] | 379 | return android::base::Join(tmp, separator); |
Igor Murashkin | aaebaa0 | 2015-01-26 10:55:53 -0800 | [diff] [blame] | 380 | // TODO: Join should probably take an offset or iterators |
| 381 | } |
| 382 | |
| 383 | private: |
| 384 | static bool StartsWith(const std::string& larger, const std::string& smaller) { |
| 385 | if (larger.size() >= smaller.size()) { |
| 386 | return std::equal(smaller.begin(), smaller.end(), larger.begin()); |
| 387 | } |
| 388 | |
| 389 | return false; |
| 390 | } |
| 391 | |
| 392 | template <typename TPredicate> |
| 393 | TokenRange RemoveIf(const TPredicate& predicate) const { |
| 394 | // If any of the tokens in the token lists are empty, then |
| 395 | // we need to remove them and compress the token list into a smaller one. |
| 396 | bool remove = false; |
| 397 | for (auto it = begin_; it != end_; ++it) { |
| 398 | auto&& token = *it; |
| 399 | |
| 400 | if (predicate(token)) { |
| 401 | remove = true; |
| 402 | break; |
| 403 | } |
| 404 | } |
| 405 | |
| 406 | // Actually copy the token list and remove the tokens that don't match our predicate. |
| 407 | if (remove) { |
| 408 | auto token_list = std::make_shared<TokenList>(begin(), end()); |
| 409 | TokenList::iterator new_end = |
| 410 | std::remove_if(token_list->begin(), token_list->end(), predicate); |
| 411 | token_list->erase(new_end, token_list->end()); |
| 412 | |
| 413 | assert(token_list_->size() > token_list->size() && "Nothing was actually removed!"); |
| 414 | |
| 415 | return TokenRange(token_list); |
| 416 | } |
| 417 | |
| 418 | return *this; |
| 419 | } |
| 420 | |
| 421 | const std::shared_ptr<std::vector<std::string>> token_list_; |
| 422 | const iterator begin_; |
| 423 | const iterator end_; |
| 424 | }; |
| 425 | } // namespace art |
| 426 | |
| 427 | #endif // ART_CMDLINE_TOKEN_RANGE_H_ |