Eugene Kliuchnikov | 35e69fc | 2018-02-26 09:04:36 -0500 | [diff] [blame] | 1 | #ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_ |
| 2 | #define BROTLI_RESEARCH_DURCHSCHLAG_H_ |
| 3 | |
| 4 | #include <cstddef> |
| 5 | #include <cstdint> |
| 6 | #include <string> |
| 7 | #include <vector> |
| 8 | |
/**
 * Builds a dictionary out of the supplied samples.
 *
 * @param dictionary_size_limit upper bound for the dictionary size
 * @param slice_len size of a text slice
 * @param block_len length of a score block
 * @param sample_sizes vector holding the size of every sample
 * @param sample_data the samples, concatenated
 * @return the generated dictionary
 */
std::string durchschlag_generate(
    size_t dictionary_size_limit,
    size_t slice_len,
    size_t block_len,
    const std::vector<size_t>& sample_sizes,
    const uint8_t* sample_data);
| 22 | |
| 23 | //------------------------------------------------------------------------------ |
| 24 | // Lower level API for repetitive dictionary generation. |
| 25 | //------------------------------------------------------------------------------ |
| 26 | |
/* Index ("pointer") of a position in the text; a 32-bit unsigned value. */
using DurchschlagTextIdx = uint32_t;
| 29 | |
| 30 | /* Context is made public for flexible serialization / deserialization. */ |
| 31 | typedef struct DurchschlagContext { |
| 32 | DurchschlagTextIdx dataSize; |
| 33 | DurchschlagTextIdx sliceLen; |
| 34 | DurchschlagTextIdx numUniqueSlices; |
| 35 | std::vector<DurchschlagTextIdx> offsets; |
| 36 | std::vector<DurchschlagTextIdx> sliceMap; |
| 37 | } DurchschlagContext; |
| 38 | |
| 39 | DurchschlagContext durchschlag_prepare(size_t slice_len, |
| 40 | const std::vector<size_t>& sample_sizes, const uint8_t* sample_data); |
| 41 | |
/* Resource usage strategy: selects the trade-off between speed and memory
   consumption. It is the first argument of the context-based
   durchschlag_generate overload. */
typedef enum DurchschalgResourceStrategy {
  // Faster
  DURCHSCHLAG_EXCLUSIVE = 0,
  // Uses much less memory
  DURCHSCHLAG_COLLABORATIVE = 1
} DurchschalgResourceStrategy;

/* Correctly spelled alias. The enum name above is misspelled ("Durchschalg");
   it is kept unchanged so that existing callers keep compiling. */
typedef DurchschalgResourceStrategy DurchschlagResourceStrategy;
| 48 | |
| 49 | std::string durchschlag_generate(DurchschalgResourceStrategy strategy, |
| 50 | size_t dictionary_size_limit, size_t block_len, |
| 51 | const DurchschlagContext& context, const uint8_t* sample_data); |
| 52 | |
| 53 | //------------------------------------------------------------------------------ |
| 54 | // Suffix Array based preparation. |
| 55 | //------------------------------------------------------------------------------ |
| 56 | |
| 57 | typedef struct DurchschlagIndex { |
| 58 | std::vector<DurchschlagTextIdx> lcp; |
| 59 | std::vector<DurchschlagTextIdx> sa; |
| 60 | } DurchschlagIndex; |
| 61 | |
| 62 | DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data); |
| 63 | |
| 64 | DurchschlagContext durchschlag_prepare(size_t slice_len, |
| 65 | const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index); |
| 66 | |
| 67 | //------------------------------------------------------------------------------ |
| 68 | // Data preparation. |
| 69 | //------------------------------------------------------------------------------ |
| 70 | |
/**
 * Removes unique slices from the samples.
 *
 * Operates in place on both @p sample_sizes and @p sample_data. The number
 * of samples stays the same, although some of them get shorter.
 *
 * @param slice_len size of a (unique) slice
 * @param minimum_population minimum occurrence count of a non-unique slice
 * @param sample_sizes [in / out] vector with sample sizes
 * @param sample_data [in / out] concatenated samples
 */
void durchschlag_distill(
    size_t slice_len,
    size_t minimum_population,
    std::vector<size_t>* sample_sizes,
    uint8_t* sample_data);
| 84 | |
/**
 * Overwrites unique slices with zeroes.
 *
 * Operates in place on @p sample_data. Neither the number of samples nor
 * their lengths change.
 *
 * @param slice_len size of a (unique) slice
 * @param minimum_population minimum occurrence count of a non-unique slice
 * @param sample_sizes vector with sample sizes
 * @param sample_data [in / out] concatenated samples
 */
void durchschlag_purify(
    size_t slice_len,
    size_t minimum_population,
    const std::vector<size_t>& sample_sizes,
    uint8_t* sample_data);
| 98 | |
| 99 | #endif // BROTLI_RESEARCH_DURCHSCHLAG_H_ |