blob: b02655ca4176dba31952106207827db7d360850d [file] [log] [blame]
Alex Deymo48ad5ab2017-09-13 22:17:57 +02001// Copyright 2017 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "bsdiff/suffix_array_index.h"
6
7#include <limits>
8#include <vector>
9
10#include <divsufsort.h>
11#include <divsufsort64.h>
12
13#include "bsdiff/logging.h"
14
Alex Deymo48ad5ab2017-09-13 22:17:57 +020015namespace {
16
17// libdivsufsort C++ overloaded functions used to allow calling the right
18// implementation based on the pointer size.
19int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
20 return divsufsort(text, sa, n);
21}
22int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
23 return divsufsort64(text, sa, n);
24}
25
26saidx_t CallSaSearch(const uint8_t* text,
27 size_t text_size,
28 const uint8_t* pattern,
29 size_t pattern_size,
30 const saidx_t* sa,
31 size_t sa_size,
32 saidx_t* left) {
33 return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
34}
35saidx64_t CallSaSearch(const uint8_t* text,
36 size_t text_size,
37 const uint8_t* pattern,
38 size_t pattern_size,
39 const saidx64_t* sa,
40 size_t sa_size,
41 saidx64_t* left) {
42 return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
43}
44
45} // namespace
46
47namespace bsdiff {
48
49// The SAIDX template type must be either saidx_t or saidx64_t, which will
50// depend on the maximum size of the input text needed.
51template <typename SAIDX>
52class SuffixArrayIndex : public SuffixArrayIndexInterface {
53 public:
54 SuffixArrayIndex() = default;
55
56 // Initialize and construct the suffix array of the |text| string of length
57 // |n|. The memory pointed by |text| must be kept alive. Returns whether the
58 // construction succeeded.
59 bool Init(const uint8_t* text, size_t n);
60
61 // SuffixArrayIndexInterface overrides.
62 void SearchPrefix(const uint8_t* target,
63 size_t length,
64 size_t* out_length,
65 uint64_t* out_pos) const override;
66
67 private:
68 const uint8_t* text_{nullptr}; // Owned by the caller.
69 size_t n_{0};
70
71 std::vector<SAIDX> sa_;
72};
73
74template <typename SAIDX>
75bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
76 if (!sa_.empty()) {
77 // Already initialized.
Tianjie Xu18480eb2017-11-29 16:21:43 -080078 LOG(ERROR) << "SuffixArray already initialized";
Alex Deymo48ad5ab2017-09-13 22:17:57 +020079 return false;
80 }
81 if (static_cast<uint64_t>(n) >
82 static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
Tianjie Xu18480eb2017-11-29 16:21:43 -080083 LOG(ERROR) << "Input too big (" << n << ") for this implementation";
Alex Deymo48ad5ab2017-09-13 22:17:57 +020084 return false;
85 }
86 text_ = text;
87 n_ = n;
88 sa_.resize(n + 1);
89
90 if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
Tianjie Xu18480eb2017-11-29 16:21:43 -080091 LOG(ERROR) << "divsufsrot() failed";
Alex Deymo48ad5ab2017-09-13 22:17:57 +020092 return false;
93 }
94
95 return true;
96}
97
98template <typename SAIDX>
99void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
100 size_t length,
101 size_t* out_length,
102 uint64_t* out_pos) const {
103 SAIDX suf_left;
104 SAIDX count =
105 CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
106 if (count > 0) {
107 // This is the simple case where we found the whole |target| string was
108 // found.
109 *out_pos = sa_[suf_left];
110 *out_length = length;
111 return;
112 }
113 // In this case, |suf_left| points to the first suffix array position such
114 // that the suffix at that position is lexicographically larger than |target|.
115 // We only need to check whether the previous entry or the current entry is a
116 // longer match.
117 size_t prev_suffix_len = 0;
118 if (suf_left > 0) {
119 const size_t prev_max_len =
120 std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
121 const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
122 prev_suffix_len =
123 std::mismatch(target, target + prev_max_len, prev_suffix).first -
124 target;
125 }
126
127 size_t next_suffix_len = 0;
128 if (static_cast<size_t>(suf_left) < n_) {
129 const uint8_t* next_suffix = text_ + sa_[suf_left];
130 const size_t next_max_len =
131 std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
132 next_suffix_len =
133 std::mismatch(target, target + next_max_len, next_suffix).first -
134 target;
135 }
136
137 *out_length = std::max(next_suffix_len, prev_suffix_len);
138 if (!*out_length) {
139 *out_pos = 0;
140 } else if (next_suffix_len >= prev_suffix_len) {
141 *out_pos = sa_[suf_left];
142 } else {
143 *out_pos = sa_[suf_left - 1];
144 }
145}
146
147std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
148 const uint8_t* text,
149 size_t n) {
150 // The maximum supported size when using the suffix array based on the 32-bit
151 // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
152 // than the maximum representable number so references like "n + 1" are don't
153 // overflow.
154 const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
155 std::unique_ptr<SuffixArrayIndexInterface> ret;
156
157 if (n > kMaxSaidxSize) {
158 SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
159 ret.reset(sa_ptr);
160 if (!sa_ptr->Init(text, n))
161 return nullptr;
162 } else {
163 SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
164 ret.reset(sa_ptr);
165 if (!sa_ptr->Init(text, n))
166 return nullptr;
167 }
168 return ret;
169}
170
171
172} // namespace bsdiff