blob: 00cfaba323c72df0fd1b81d6c17787c792f66e7c [file] [log] [blame]
Tianjie Xua5dcb7c2018-09-25 12:25:15 -07001#include <cstddef>
2#include <cstdio>
3#include <cstring>
4#include <fstream>
5#include <vector>
6
7#include "./deorummolae.h"
8#include "./durchschlag.h"
9#include "./sieve.h"
10
/* Identifiers of the supported engines / sample rewriters.
   Kept as an unscoped enum so that the values still convert to `int`
   and can be used as `case` labels, exactly like the former macros. */
enum {
  METHOD_DM = 0,          /* selected by --dm */
  METHOD_SIEVE = 1,       /* selected by --sieve */
  METHOD_DURCHSCHLAG = 2, /* selected by --dsh (default) */
  METHOD_DISTILL = 3,     /* selected by --distill */
  METHOD_PURIFY = 4       /* selected by --purify */
};
16
/* Parses a positive decimal number with an optional k/K (x1024) or
   m/M (x1048576) suffix.

   Returns 0 when the text is not acceptable: empty input, a leading '0',
   a suffix that is not the last character, a suffix without digits, any
   other non-digit character, more than 12 digits (13 characters including
   the suffix), or a value that does not fit into size_t. */
static size_t readInt(const char* str) {
  /* Largest representable value; used for exact overflow checks that hold
     for any width of size_t. */
  const size_t kMaxValue = ~static_cast<size_t>(0);
  const size_t kMaxLength = 13;

  if (str[0] == 0 || str[0] == '0') {
    return 0;
  }

  size_t result = 0;
  for (size_t i = 0; i < kMaxLength; ++i) {
    const char c = str[i];
    if (c == 0) {
      return result;
    }

    unsigned shift = 0;
    if (c == 'k' || c == 'K') {
      shift = 10;
    } else if (c == 'm' || c == 'M') {
      shift = 20;
    }
    if (shift != 0) {
      /* Suffix must terminate the string and must follow at least one digit. */
      if (str[i + 1] != 0 || result == 0) {
        return 0;
      }
      /* Reject values whose scaled form would not fit. */
      if (result > (kMaxValue >> shift)) {
        return 0;
      }
      return result << shift;
    }

    if (c < '0' || c > '9') {
      return 0;
    }
    const size_t digit = static_cast<size_t>(c - '0');
    /* Equivalent to "10 * result + digit > kMaxValue", without wrapping. */
    if (result > (kMaxValue - digit) / 10) {
      return 0;
    }
    result = 10 * result + digit;
  }
  /* Input is longer than kMaxLength characters. */
  return 0;
}
49
/* Reads the whole file at `path` and returns its bytes.

   The stream is opened in binary mode so that sample bytes are read
   verbatim; samples may later be written back in binary mode, and a
   text-mode read could alter them on platforms that translate newlines.
   If the file cannot be opened, an empty string is returned. */
static std::string readFile(const std::string& path) {
  std::ifstream file(path, std::ifstream::binary);
  std::string content(
      (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
  return content;
}
56
/* Stores `content` into the file named `file`; the stream is opened in
   binary mode. No error is reported if the file cannot be written. */
static void writeFile(const char* file, const std::string& content) {
  const std::streamsize byteCount =
      static_cast<std::streamsize>(content.size());
  std::ofstream sink(file, std::ofstream::binary);
  sink.write(content.c_str(), byteCount);
  sink.close();
}
62
/* Writes consecutive chunks of `data` back to the sample files.

   For every k, the file named argv[pathArgs[k]] receives the next sizes[k]
   bytes of `data`; chunks are laid out back to back, in the order of
   `pathArgs`. Files are opened in binary mode. */
static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
    const std::vector<size_t>& sizes, const uint8_t* data) {
  const uint8_t* cursor = data;
  size_t sampleIndex = 0;
  for (const int argIndex : pathArgs) {
    const size_t length = sizes[sampleIndex++];
    std::ofstream sink(argv[argIndex], std::ofstream::binary);
    sink.write(reinterpret_cast<const char*>(cursor),
               static_cast<std::streamsize>(length));
    sink.close();
    /* Advance to the beginning of the next sample. */
    cursor += length;
  }
}
77
/* Returns the part of `path` that follows the last '/' or '\' separator;
   when there is no separator, `path` itself is returned. The result points
   into the original string. */
static const char* fileName(const char* path) {
  const char* tail = path;
  for (const char* cursor = path; *cursor != '\0'; ++cursor) {
    if (*cursor == '/' || *cursor == '\\') {
      tail = cursor + 1;
    }
  }
  return tail;
}
86
/* Prints the usage line (with `name` as the program name) followed by the
   option reference to stderr. */
static void printHelp(const char* name) {
  /* The option reference contains no format specifiers, so it is emitted
     verbatim. */
  static const char kOptionsText[] =
      "Options:\n"
      " --dm use 'deorummolae' engine\n"
      " --distill rewrite samples; unique text parts are removed\n"
      " --dsh use 'durchschlag' engine (default)\n"
      " --purify rewrite samples; unique text parts are zeroed out\n"
      " --sieve use 'sieve' engine\n"
      " -b# set block length for 'durchschlag'; default: 1024\n"
      " -s# set slice length for 'distill', 'durchschlag', 'purify'\n"
      " and 'sieve'; default: 16\n"
      " -t# set target dictionary size (limit); default: 16K\n"
      " -u# set minimum slice population (for rewrites); default: 2\n"
      "# is a decimal number with optional k/K/m/M suffix.\n"
      "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
      " Completely unique samples might become empty files.\n\n";

  fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
  fputs(kOptionsText, stderr);
}
105
106int main(int argc, char const* argv[]) {
107 int dictionaryArg = -1;
108 int method = METHOD_DURCHSCHLAG;
109 size_t sliceLen = 16;
110 size_t targetSize = 16 << 10;
111 size_t blockSize = 1024;
112 size_t minimumPopulation = 2;
113
114 std::vector<uint8_t> data;
115 std::vector<size_t> sizes;
116 std::vector<int> pathArgs;
117 size_t total = 0;
118 for (int i = 1; i < argc; ++i) {
119 if (argv[i] == nullptr) {
120 continue;
121 }
122 if (argv[i][0] == '-') {
123 if (argv[i][1] == '-') {
124 if (dictionaryArg != -1) {
125 fprintf(stderr,
126 "Method should be specified before dictionary / sample '%s'\n",
127 argv[i]);
128 exit(1);
129 }
130 if (std::strcmp("--sieve", argv[i]) == 0) {
131 method = METHOD_SIEVE;
132 continue;
133 }
134 if (std::strcmp("--dm", argv[i]) == 0) {
135 method = METHOD_DM;
136 continue;
137 }
138 if (std::strcmp("--dsh", argv[i]) == 0) {
139 method = METHOD_DURCHSCHLAG;
140 continue;
141 }
142 if (std::strcmp("--distill", argv[i]) == 0) {
143 method = METHOD_DISTILL;
144 continue;
145 }
146 if (std::strcmp("--purify", argv[i]) == 0) {
147 method = METHOD_PURIFY;
148 continue;
149 }
150 printHelp(fileName(argv[0]));
151 fprintf(stderr, "Invalid option '%s'\n", argv[i]);
152 exit(1);
153 }
154 if (argv[i][1] == 'b') {
155 blockSize = readInt(&argv[i][2]);
156 if (blockSize < 16 || blockSize > 65536) {
157 printHelp(fileName(argv[0]));
158 fprintf(stderr, "Invalid option '%s'\n", argv[i]);
159 exit(1);
160 }
161 } else if (argv[i][1] == 's') {
162 sliceLen = readInt(&argv[i][2]);
163 if (sliceLen < 4 || sliceLen > 256) {
164 printHelp(fileName(argv[0]));
165 fprintf(stderr, "Invalid option '%s'\n", argv[i]);
166 exit(1);
167 }
168 } else if (argv[i][1] == 't') {
169 targetSize = readInt(&argv[i][2]);
170 if (targetSize < 256 || targetSize > (1 << 25)) {
171 printHelp(fileName(argv[0]));
172 fprintf(stderr, "Invalid option '%s'\n", argv[i]);
173 exit(1);
174 }
175 } else if (argv[i][1] == 'u') {
176 minimumPopulation = readInt(&argv[i][2]);
177 if (minimumPopulation < 256 || minimumPopulation > 65536) {
178 printHelp(fileName(argv[0]));
179 fprintf(stderr, "Invalid option '%s'\n", argv[i]);
180 exit(1);
181 }
182 } else {
183 printHelp(fileName(argv[0]));
184 fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
185 exit(1);
186 }
187 continue;
188 }
189 if (dictionaryArg == -1) {
190 if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
191 dictionaryArg = i;
192 continue;
193 }
194 }
195 std::string content = readFile(argv[i]);
196 data.insert(data.end(), content.begin(), content.end());
197 total += content.size();
198 pathArgs.push_back(i);
199 sizes.push_back(content.size());
200 }
201 bool wantDictionary = (dictionaryArg == -1);
202 if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
203 wantDictionary = false;
204 }
205 if (wantDictionary || total == 0) {
206 printHelp(fileName(argv[0]));
207 fprintf(stderr, "Not enough arguments\n");
208 exit(1);
209 }
210
211 if (method == METHOD_SIEVE) {
212 writeFile(argv[dictionaryArg], sieve_generate(
213 targetSize, sliceLen, sizes, data.data()));
214 } else if (method == METHOD_DM) {
215 writeFile(argv[dictionaryArg], DM_generate(
216 targetSize, sizes, data.data()));
217 } else if (method == METHOD_DURCHSCHLAG) {
218 writeFile(argv[dictionaryArg], durchschlag_generate(
219 targetSize, sliceLen, blockSize, sizes, data.data()));
220 } else if (method == METHOD_DISTILL) {
221 durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
222 writeSamples(argv, pathArgs, sizes, data.data());
223 } else if (method == METHOD_PURIFY) {
224 durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
225 writeSamples(argv, pathArgs, sizes, data.data());
226 } else {
227 printHelp(fileName(argv[0]));
228 fprintf(stderr, "Unknown generator\n");
229 exit(1);
230 }
231 return 0;
232}