| // Copyright 2017 The Chromium OS Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "bsdiff/suffix_array_index.h" |
| |
| #include <limits> |
| #include <vector> |
| |
| #include <divsufsort.h> |
| #include <divsufsort64.h> |
| |
| #include "bsdiff/logging.h" |
| |
| namespace { |
| |
| // libdivsufsort C++ overloaded functions used to allow calling the right |
| // implementation based on the pointer size. |
| int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) { |
| return divsufsort(text, sa, n); |
| } |
| int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) { |
| return divsufsort64(text, sa, n); |
| } |
| |
| saidx_t CallSaSearch(const uint8_t* text, |
| size_t text_size, |
| const uint8_t* pattern, |
| size_t pattern_size, |
| const saidx_t* sa, |
| size_t sa_size, |
| saidx_t* left) { |
| return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left); |
| } |
| saidx64_t CallSaSearch(const uint8_t* text, |
| size_t text_size, |
| const uint8_t* pattern, |
| size_t pattern_size, |
| const saidx64_t* sa, |
| size_t sa_size, |
| saidx64_t* left) { |
| return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left); |
| } |
| |
| } // namespace |
| |
| namespace bsdiff { |
| |
| // The SAIDX template type must be either saidx_t or saidx64_t, which will |
| // depend on the maximum size of the input text needed. |
| template <typename SAIDX> |
| class SuffixArrayIndex : public SuffixArrayIndexInterface { |
| public: |
| SuffixArrayIndex() = default; |
| |
| // Initialize and construct the suffix array of the |text| string of length |
| // |n|. The memory pointed by |text| must be kept alive. Returns whether the |
| // construction succeeded. |
| bool Init(const uint8_t* text, size_t n); |
| |
| // SuffixArrayIndexInterface overrides. |
| void SearchPrefix(const uint8_t* target, |
| size_t length, |
| size_t* out_length, |
| uint64_t* out_pos) const override; |
| |
| private: |
| const uint8_t* text_{nullptr}; // Owned by the caller. |
| size_t n_{0}; |
| |
| std::vector<SAIDX> sa_; |
| }; |
| |
| template <typename SAIDX> |
| bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) { |
| if (!sa_.empty()) { |
| // Already initialized. |
| LOG(ERROR) << "SuffixArray already initialized"; |
| return false; |
| } |
| if (static_cast<uint64_t>(n) > |
| static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) { |
| LOG(ERROR) << "Input too big (" << n << ") for this implementation"; |
| return false; |
| } |
| text_ = text; |
| n_ = n; |
| sa_.resize(n + 1); |
| |
| if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) { |
| LOG(ERROR) << "divsufsrot() failed"; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| template <typename SAIDX> |
| void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target, |
| size_t length, |
| size_t* out_length, |
| uint64_t* out_pos) const { |
| SAIDX suf_left; |
| SAIDX count = |
| CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left); |
| if (count > 0) { |
| // This is the simple case where we found the whole |target| string was |
| // found. |
| *out_pos = sa_[suf_left]; |
| *out_length = length; |
| return; |
| } |
| // In this case, |suf_left| points to the first suffix array position such |
| // that the suffix at that position is lexicographically larger than |target|. |
| // We only need to check whether the previous entry or the current entry is a |
| // longer match. |
| size_t prev_suffix_len = 0; |
| if (suf_left > 0) { |
| const size_t prev_max_len = |
| std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length); |
| const uint8_t* prev_suffix = text_ + sa_[suf_left - 1]; |
| prev_suffix_len = |
| std::mismatch(target, target + prev_max_len, prev_suffix).first - |
| target; |
| } |
| |
| size_t next_suffix_len = 0; |
| if (static_cast<size_t>(suf_left) < n_) { |
| const uint8_t* next_suffix = text_ + sa_[suf_left]; |
| const size_t next_max_len = |
| std::min(n_ - static_cast<size_t>(sa_[suf_left]), length); |
| next_suffix_len = |
| std::mismatch(target, target + next_max_len, next_suffix).first - |
| target; |
| } |
| |
| *out_length = std::max(next_suffix_len, prev_suffix_len); |
| if (!*out_length) { |
| *out_pos = 0; |
| } else if (next_suffix_len >= prev_suffix_len) { |
| *out_pos = sa_[suf_left]; |
| } else { |
| *out_pos = sa_[suf_left - 1]; |
| } |
| } |
| |
| std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex( |
| const uint8_t* text, |
| size_t n) { |
| // The maximum supported size when using the suffix array based on the 32-bit |
| // saidx_t type. We limit this to something a bit smaller (16 bytes smaller) |
| // than the maximum representable number so references like "n + 1" are don't |
| // overflow. |
| const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16; |
| std::unique_ptr<SuffixArrayIndexInterface> ret; |
| |
| if (n > kMaxSaidxSize) { |
| SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>(); |
| ret.reset(sa_ptr); |
| if (!sa_ptr->Init(text, n)) |
| return nullptr; |
| } else { |
| SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>(); |
| ret.reset(sa_ptr); |
| if (!sa_ptr->Init(text, n)) |
| return nullptr; |
| } |
| return ret; |
| } |
| |
| |
| } // namespace bsdiff |