blob: 8cb1532a381600f44fc5145ad89d506235300869 [file] [log] [blame]
Philipp Reisnerb411b362009-09-25 16:07:19 -07001/*
2-*- linux-c -*-
3 drbd_receiver.c
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#ifndef _DRBD_VLI_H
26#define _DRBD_VLI_H
27
28/*
29 * At a granularity of 4KiB storage represented per bit,
30 * and stroage sizes of several TiB,
31 * and possibly small-bandwidth replication,
32 * the bitmap transfer time can take much too long,
33 * if transmitted in plain text.
34 *
Lucas De Marchi25985ed2011-03-30 22:57:33 -030035 * We try to reduce the transferred bitmap information
Philipp Reisnerb411b362009-09-25 16:07:19 -070036 * by encoding runlengths of bit polarity.
37 *
38 * We never actually need to encode a "zero" (runlengths are positive).
39 * But then we have to store the value of the first bit.
40 * The first bit of information thus shall encode if the first runlength
41 * gives the number of set or unset bits.
42 *
43 * We assume that large areas are either completely set or unset,
44 * which gives good compression with any runlength method,
45 * even when encoding the runlength as fixed size 32bit/64bit integers.
46 *
47 * Still, there may be areas where the polarity flips every few bits,
48 * and encoding the runlength sequence of those areas with fix size
49 * integers would be much worse than plaintext.
50 *
51 * We want to encode small runlength values with minimum code length,
52 * while still being able to encode a Huge run of all zeros.
53 *
54 * Thus we need a Variable Length Integer encoding, VLI.
55 *
56 * For some cases, we produce more code bits than plaintext input.
57 * We need to send incompressible chunks as plaintext, skip over them
58 * and then see if the next chunk compresses better.
59 *
60 * We don't care too much about "excellent" compression ratio for large
61 * runlengths (all set/all clear): whether we achieve a factor of 100
62 * or 1000 is not that much of an issue.
63 * We do not want to waste too much on short runlengths in the "noisy"
64 * parts of the bitmap, though.
65 *
66 * There are endless variants of VLI, we experimented with:
67 * * simple byte-based
68 * * various bit based with different code word length.
69 *
70 * To avoid yet an other configuration parameter (choice of bitmap compression
71 * algorithm) which was difficult to explain and tune, we just chose the one
72 * variant that turned out best in all test cases.
73 * Based on real world usage patterns, with device sizes ranging from a few GiB
74 * to several TiB, file server/mailserver/webserver/mysql/postgress,
75 * mostly idle to really busy, the all time winner (though sometimes only
76 * marginally better) is:
77 */
78
79/*
80 * encoding is "visualised" as
81 * __little endian__ bitstream, least significant bit first (left most)
82 *
83 * this particular encoding is chosen so that the prefix code
84 * starts as unary encoding the level, then modified so that
85 * 10 levels can be described in 8bit, with minimal overhead
86 * for the smaller levels.
87 *
88 * Number of data bits follow fibonacci sequence, with the exception of the
89 * last level (+1 data bit, so it makes 64bit total). The only worse code when
90 * encoding bit polarity runlength is 1 plain bits => 2 code bits.
91prefix data bits max val NÂș data bits
920 x 0x2 1
9310 x 0x4 1
94110 xx 0x8 2
951110 xxx 0x10 3
9611110 xxx xx 0x30 5
97111110 xx xxxxxx 0x130 8
9811111100 xxxxxxxx xxxxx 0x2130 13
9911111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
10011111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
10111111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
102 * maximum encodable value: 0x100000400202130 == 2**56 + some */
103
104/* compression "table":
105 transmitted x 0.29
106 as plaintext x ........................
107 x ........................
108 x ........................
109 x 0.59 0.21........................
110 x ........................................................
111 x .. c ...................................................
112 x 0.44.. o ...................................................
113 x .......... d ...................................................
114 x .......... e ...................................................
115 X............. ...................................................
116 x.............. b ...................................................
1172.0x............... i ...................................................
118 #X................ t ...................................................
119 #................. s ........................... plain bits ..........
120-+-----------------------------------------------------------------------
121 1 16 32 64
122*/
123
124/* LEVEL: (total bits, prefix bits, prefix value),
125 * sorted ascending by number of total bits.
126 * The rest of the code table is calculated at compiletime from this. */
127
128/* fibonacci data 1, 1, ... */
129#define VLI_L_1_1() do { \
130 LEVEL( 2, 1, 0x00); \
131 LEVEL( 3, 2, 0x01); \
132 LEVEL( 5, 3, 0x03); \
133 LEVEL( 7, 4, 0x07); \
134 LEVEL(10, 5, 0x0f); \
135 LEVEL(14, 6, 0x1f); \
136 LEVEL(21, 8, 0x3f); \
137 LEVEL(29, 8, 0x7f); \
138 LEVEL(42, 8, 0xbf); \
139 LEVEL(64, 8, 0xff); \
140 } while (0)
141
142/* finds a suitable level to decode the least significant part of in.
143 * returns number of bits consumed.
144 *
145 * BUG() for bad input, as that would mean a buggy code table. */
146static inline int vli_decode_bits(u64 *out, const u64 in)
147{
148 u64 adj = 1;
149
150#define LEVEL(t,b,v) \
151 do { \
152 if ((in & ((1 << b) -1)) == v) { \
153 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
154 return t; \
155 } \
156 adj += 1ULL << (t - b); \
157 } while (0)
158
159 VLI_L_1_1();
160
161 /* NOT REACHED, if VLI_LEVELS code table is defined properly */
162 BUG();
163#undef LEVEL
164}
165
166/* return number of code bits needed,
167 * or negative error number */
168static inline int __vli_encode_bits(u64 *out, const u64 in)
169{
170 u64 max = 0;
171 u64 adj = 1;
172
173 if (in == 0)
174 return -EINVAL;
175
176#define LEVEL(t,b,v) do { \
177 max += 1ULL << (t - b); \
178 if (in <= max) { \
179 if (out) \
180 *out = ((in - adj) << b) | v; \
181 return t; \
182 } \
183 adj = max + 1; \
184 } while (0)
185
186 VLI_L_1_1();
187
188 return -EOVERFLOW;
189#undef LEVEL
190}
191
192#undef VLI_L_1_1
193
194/* code from here down is independend of actually used bit code */
195
196/*
197 * Code length is determined by some unique (e.g. unary) prefix.
198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
199 * not a byte stream.
200 */
201
202/* for the bitstream, we need a cursor */
203struct bitstream_cursor {
204 /* the current byte */
205 u8 *b;
206 /* the current bit within *b, nomalized: 0..7 */
207 unsigned int bit;
208};
209
210/* initialize cursor to point to first bit of stream */
211static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
212{
213 cur->b = s;
214 cur->bit = 0;
215}
216
217/* advance cursor by that many bits; maximum expected input value: 64,
218 * but depending on VLI implementation, it may be more. */
219static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
220{
221 bits += cur->bit;
222 cur->b = cur->b + (bits >> 3);
223 cur->bit = bits & 7;
224}
225
226/* the bitstream itself knows its length */
227struct bitstream {
228 struct bitstream_cursor cur;
229 unsigned char *buf;
230 size_t buf_len; /* in bytes */
231
232 /* for input stream:
233 * number of trailing 0 bits for padding
234 * total number of valid bits in stream: buf_len * 8 - pad_bits */
235 unsigned int pad_bits;
236};
237
238static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
239{
240 bs->buf = s;
241 bs->buf_len = len;
242 bs->pad_bits = pad_bits;
243 bitstream_cursor_reset(&bs->cur, bs->buf);
244}
245
246static inline void bitstream_rewind(struct bitstream *bs)
247{
248 bitstream_cursor_reset(&bs->cur, bs->buf);
249 memset(bs->buf, 0, bs->buf_len);
250}
251
252/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
253 * Ignores "pad_bits".
254 * Returns zero if bits == 0 (nothing to do).
255 * Returns number of bits used if successful.
256 *
257 * If there is not enough room left in bitstream,
258 * leaves bitstream unchanged and returns -ENOBUFS.
259 */
260static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
261{
262 unsigned char *b = bs->cur.b;
263 unsigned int tmp;
264
265 if (bits == 0)
266 return 0;
267
268 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
269 return -ENOBUFS;
270
271 /* paranoia: strip off hi bits; they should not be set anyways. */
272 if (bits < 64)
273 val &= ~0ULL >> (64 - bits);
274
275 *b++ |= (val & 0xff) << bs->cur.bit;
276
277 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
278 *b++ |= (val >> tmp) & 0xff;
279
280 bitstream_cursor_advance(&bs->cur, bits);
281 return bits;
282}
283
284/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
285 *
286 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
287 *
288 * If there are less than the requested number of valid bits left in the
289 * bitstream, still fetches all available bits.
290 *
291 * Returns number of actually fetched bits.
292 */
293static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
294{
295 u64 val;
296 unsigned int n;
297
298 if (bits > 64)
299 return -EINVAL;
300
301 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
302 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
303 - bs->cur.bit - bs->pad_bits;
304
305 if (bits == 0) {
306 *out = 0;
307 return 0;
308 }
309
310 /* get the high bits */
311 val = 0;
312 n = (bs->cur.bit + bits + 7) >> 3;
313 /* n may be at most 9, if cur.bit + bits > 64 */
314 /* which means this copies at most 8 byte */
315 if (n) {
316 memcpy(&val, bs->cur.b+1, n - 1);
317 val = le64_to_cpu(val) << (8 - bs->cur.bit);
318 }
319
320 /* we still need the low bits */
321 val |= bs->cur.b[0] >> bs->cur.bit;
322
323 /* and mask out bits we don't want */
324 val &= ~0ULL >> (64 - bits);
325
326 bitstream_cursor_advance(&bs->cur, bits);
327 *out = val;
328
329 return bits;
330}
331
332/* encodes @in as vli into @bs;
333
334 * return values
335 * > 0: number of bits successfully stored in bitstream
336 * -ENOBUFS @bs is full
337 * -EINVAL input zero (invalid)
338 * -EOVERFLOW input too large for this vli code (invalid)
339 */
340static inline int vli_encode_bits(struct bitstream *bs, u64 in)
341{
342 u64 code = code;
343 int bits = __vli_encode_bits(&code, in);
344
345 if (bits <= 0)
346 return bits;
347
348 return bitstream_put_bits(bs, code, bits);
349}
350
351#endif