Tianjie Xu | a5dcb7c | 2018-09-25 12:25:15 -0700 | [diff] [blame] | 1 | #include "./sieve.h" |
| 2 | |
| 3 | /* Pointer to position in (combined corpus) text. */ |
| 4 | typedef uint32_t TextIdx; |
| 5 | |
| 6 | /* Index of sample / generation. */ |
| 7 | typedef uint16_t SampleIdx; |
| 8 | |
| 9 | typedef struct Slot { |
| 10 | TextIdx next; |
| 11 | TextIdx offset; |
| 12 | SampleIdx presence; |
| 13 | SampleIdx mark; |
| 14 | } Slot; |
| 15 | |
| 16 | static const TextIdx kNowhere = static_cast<TextIdx>(-1); |
| 17 | |
| 18 | static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut, |
| 19 | TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) { |
| 20 | TextIdx from = kNowhere; |
| 21 | TextIdx to = kNowhere; |
| 22 | TextIdx result = 0; |
| 23 | SampleIdx targetPresence = minPresence; |
| 24 | for (TextIdx i = 0; i < end; ++i) { |
| 25 | if (i == middle) { |
| 26 | targetPresence++; |
| 27 | } |
| 28 | Slot& item = map[shortcut[i]]; |
| 29 | if (item.mark != iteration) { |
| 30 | item.mark = iteration; |
| 31 | if (item.presence >= targetPresence) { |
| 32 | if ((to == kNowhere) || (to < i)) { |
| 33 | if (from != kNowhere) { |
| 34 | result += to - from; |
| 35 | } |
| 36 | from = i; |
| 37 | } |
| 38 | to = i + sliceLen; |
| 39 | } |
| 40 | } |
| 41 | } |
| 42 | if (from != kNowhere) { |
| 43 | result += to - from; |
| 44 | } |
| 45 | return result; |
| 46 | } |
| 47 | |
| 48 | static std::string createDictionary(const uint8_t* data, TextIdx sliceLen, |
| 49 | Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle, |
| 50 | SampleIdx minPresence, SampleIdx iteration) { |
| 51 | std::string output; |
| 52 | TextIdx from = kNowhere; |
| 53 | TextIdx to = kNowhere; |
| 54 | SampleIdx targetPresence = minPresence; |
| 55 | for (TextIdx i = 0; i < end; ++i) { |
| 56 | if (i == middle) { |
| 57 | targetPresence++; |
| 58 | } |
| 59 | Slot& item = map[shortcut[i]]; |
| 60 | if (item.mark != iteration) { |
| 61 | item.mark = iteration; |
| 62 | if (item.presence >= targetPresence) { |
| 63 | if ((to == kNowhere) || (to < i)) { |
| 64 | if (from != kNowhere) { |
| 65 | output.insert(output.end(), &data[from], &data[to]); |
| 66 | } |
| 67 | from = i; |
| 68 | } |
| 69 | to = i + sliceLen; |
| 70 | } |
| 71 | } |
| 72 | } |
| 73 | if (from != kNowhere) { |
| 74 | output.insert(output.end(), &data[from], &data[to]); |
| 75 | } |
| 76 | return output; |
| 77 | } |
| 78 | |
| 79 | std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, |
| 80 | const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { |
| 81 | /* Parameters aliasing */ |
| 82 | TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); |
| 83 | if (targetSize != dictionary_size_limit) { |
| 84 | fprintf(stderr, "dictionary_size_limit is too large\n"); |
| 85 | return ""; |
| 86 | } |
| 87 | TextIdx sliceLen = static_cast<TextIdx>(slice_len); |
| 88 | if (sliceLen != slice_len) { |
| 89 | fprintf(stderr, "slice_len is too large\n"); |
| 90 | return ""; |
| 91 | } |
| 92 | if (sliceLen < 1) { |
| 93 | fprintf(stderr, "slice_len is too small\n"); |
| 94 | return ""; |
| 95 | } |
| 96 | SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size()); |
| 97 | if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) { |
| 98 | fprintf(stderr, "too many samples\n"); |
| 99 | return ""; |
| 100 | } |
| 101 | const uint8_t* data = sample_data; |
| 102 | |
| 103 | TextIdx total = 0; |
| 104 | std::vector<TextIdx> offsets; |
| 105 | for (SampleIdx i = 0; i < numSamples; ++i) { |
| 106 | TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); |
| 107 | if (delta != sample_sizes[i]) { |
| 108 | fprintf(stderr, "sample is too large\n"); |
| 109 | return ""; |
| 110 | } |
| 111 | if (delta == 0) { |
| 112 | fprintf(stderr, "empty samples are prohibited\n"); |
| 113 | return ""; |
| 114 | } |
| 115 | if (total + delta <= total) { |
| 116 | fprintf(stderr, "corpus is too large\n"); |
| 117 | return ""; |
| 118 | } |
| 119 | total += delta; |
| 120 | offsets.push_back(total); |
| 121 | } |
| 122 | |
| 123 | if (total * 2 < total) { |
| 124 | fprintf(stderr, "corpus is too large\n"); |
| 125 | return ""; |
| 126 | } |
| 127 | |
| 128 | if (total < sliceLen) { |
| 129 | fprintf(stderr, "slice_len is larger than corpus size\n"); |
| 130 | return ""; |
| 131 | } |
| 132 | |
| 133 | /***************************************************************************** |
| 134 | * Build coverage map. |
| 135 | ****************************************************************************/ |
| 136 | std::vector<Slot> map; |
| 137 | std::vector<TextIdx> shortcut; |
| 138 | map.push_back({0, 0, 0, 0}); |
| 139 | TextIdx end = total - sliceLen; |
| 140 | TextIdx hashLen = 11; |
| 141 | while (hashLen < 29 && ((1u << hashLen) < end)) { |
| 142 | hashLen += 3; |
| 143 | } |
| 144 | hashLen -= 3; |
| 145 | TextIdx hashMask = (1u << hashLen) - 1u; |
| 146 | std::vector<TextIdx> hashHead(1 << hashLen); |
| 147 | TextIdx hashSlot = 1; |
| 148 | SampleIdx piece = 0; |
| 149 | TextIdx hash = 0; |
| 150 | TextIdx lShift = 3; |
| 151 | TextIdx rShift = hashLen - lShift; |
| 152 | for (TextIdx i = 0; i < sliceLen - 1; ++i) { |
| 153 | TextIdx v = data[i]; |
| 154 | hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; |
| 155 | } |
| 156 | TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; |
| 157 | TextIdx rShiftX = hashLen - lShiftX; |
| 158 | for (TextIdx i = 0; i < end; ++i) { |
| 159 | TextIdx v = data[i + sliceLen - 1]; |
| 160 | hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; |
| 161 | |
| 162 | if (offsets[piece] == i) { |
| 163 | piece++; |
| 164 | } |
| 165 | TextIdx slot = hashHead[hash]; |
| 166 | while (slot != 0) { |
| 167 | Slot& item = map[slot]; |
| 168 | TextIdx start = item.offset; |
| 169 | bool miss = false; |
| 170 | for (TextIdx j = 0; j < sliceLen; ++j) { |
| 171 | if (data[i + j] != data[start + j]) { |
| 172 | miss = true; |
| 173 | break; |
| 174 | } |
| 175 | } |
| 176 | if (!miss) { |
| 177 | if (item.mark != piece) { |
| 178 | item.mark = piece; |
| 179 | item.presence++; |
| 180 | } |
| 181 | shortcut.push_back(slot); |
| 182 | break; |
| 183 | } |
| 184 | slot = item.next; |
| 185 | } |
| 186 | if (slot == 0) { |
| 187 | map.push_back({hashHead[hash], i, 1, piece}); |
| 188 | hashHead[hash] = hashSlot; |
| 189 | shortcut.push_back(hashSlot); |
| 190 | hashSlot++; |
| 191 | } |
| 192 | v = data[i]; |
| 193 | hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask; |
| 194 | } |
| 195 | |
| 196 | /***************************************************************************** |
| 197 | * Build dictionary of specified size. |
| 198 | ****************************************************************************/ |
| 199 | SampleIdx a = 1; |
| 200 | TextIdx size = dryRun( |
| 201 | sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); |
| 202 | /* Maximal output is smaller than target. */ |
| 203 | if (size <= targetSize) { |
| 204 | return createDictionary( |
| 205 | data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); |
| 206 | } |
| 207 | |
| 208 | SampleIdx b = numSamples; |
| 209 | size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); |
| 210 | if (size == targetSize) { |
| 211 | return createDictionary( |
| 212 | data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); |
| 213 | } |
| 214 | /* Run binary search. */ |
| 215 | if (size < targetSize) { |
| 216 | /* size(a) > targetSize > size(b) && a < m < b */ |
| 217 | while (a + 1 < b) { |
| 218 | SampleIdx m = static_cast<SampleIdx>((a + b) / 2); |
| 219 | size = dryRun( |
| 220 | sliceLen, map.data(), shortcut.data(), end, end, m, ++piece); |
| 221 | if (size < targetSize) { |
| 222 | b = m; |
| 223 | } else if (size > targetSize) { |
| 224 | a = m; |
| 225 | } else { |
| 226 | return createDictionary( |
| 227 | data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); |
| 228 | } |
| 229 | } |
| 230 | } else { |
| 231 | a = b; |
| 232 | } |
| 233 | /* size(minPresence) > targetSize > size(minPresence + 1) */ |
| 234 | SampleIdx minPresence = a; |
| 235 | TextIdx c = 0; |
| 236 | TextIdx d = end; |
| 237 | /* size(a) < targetSize < size(b) && a < m < b */ |
| 238 | while (c + 1 < d) { |
| 239 | TextIdx m = (c + d) / 2; |
| 240 | size = dryRun( |
| 241 | sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); |
| 242 | if (size < targetSize) { |
| 243 | c = m; |
| 244 | } else if (size > targetSize) { |
| 245 | d = m; |
| 246 | } else { |
| 247 | return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, |
| 248 | m, minPresence, ++piece); |
| 249 | } |
| 250 | } |
| 251 | |
| 252 | bool unrestricted = false; |
| 253 | if (minPresence <= 2 && !unrestricted) { |
| 254 | minPresence = 2; |
| 255 | c = end; |
| 256 | } |
| 257 | return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c, |
| 258 | minPresence, ++piece); |
| 259 | } |