blob: f860aa64c99a5f58be9e2a0e4d8bda160dada748 [file] [log] [blame]
Rob Landley6973a1d2006-11-25 16:50:00 -05001/* vi: set sw=4 ts=4: */
2/* micro-bunzip, a small, simple bzip2 decompression implementation.
3
4 Copyright 2003, 2006 by Rob Landley (rob@landley.net).
5
Rob Landley636b5cf2007-01-16 13:26:02 -05006 Based on a close reading (but not the actual code) of the original bzip2
7 decompression code by Julian R Seward (jseward@acm.org), which also
8 acknowledges contributions by Mike Burrows, David Wheeler, Peter Fenwick,
9 Alistair Moffat, Radford Neal, Ian H. Witten, Robert Sedgewick, and
10 Jon L. Bentley.
Rob Landley6973a1d2006-11-25 16:50:00 -050011*/
12
13#include "toys.h"
14
Rob Landleyd3236c12007-12-24 19:53:20 -060015#define THREADS 1
16
Rob Landley2de8f3e2006-11-26 17:19:18 -050017// Constants for huffman coding
Rob Landleyd3236c12007-12-24 19:53:20 -060018#define MAX_GROUPS 6
19#define GROUP_SIZE 50 /* 64 would have been more efficient */
20#define MAX_HUFCODE_BITS 20 /* Longest huffman code allowed */
21#define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */
22#define SYMBOL_RUNA 0
23#define SYMBOL_RUNB 1
Rob Landley6973a1d2006-11-25 16:50:00 -050024
Rob Landley2de8f3e2006-11-26 17:19:18 -050025// Other housekeeping constants
Rob Landleyd3236c12007-12-24 19:53:20 -060026#define IOBUF_SIZE 4096
Rob Landley2de8f3e2006-11-26 17:19:18 -050027
28// Status return values
Rob Landleyd3236c12007-12-24 19:53:20 -060029#define RETVAL_LAST_BLOCK (-100)
30#define RETVAL_NOT_BZIP_DATA (-1)
31#define RETVAL_DATA_ERROR (-2)
32#define RETVAL_OBSOLETE_INPUT (-3)
Rob Landley6973a1d2006-11-25 16:50:00 -050033
Rob Landley2de8f3e2006-11-26 17:19:18 -050034char *bunzip_errors[]={
35 NULL,
Rob Landley2de8f3e2006-11-26 17:19:18 -050036 "Not bzip data",
Rob Landley2de8f3e2006-11-26 17:19:18 -050037 "Data error",
Rob Landley2de8f3e2006-11-26 17:19:18 -050038 "Obsolete (pre 0.9.5) bzip format not supported."
39};
Rob Landley6973a1d2006-11-25 16:50:00 -050040
Rob Landley2de8f3e2006-11-26 17:19:18 -050041// This is what we know about each huffman coding group
Rob Landley6973a1d2006-11-25 16:50:00 -050042struct group_data {
Rob Landley813840c2007-01-18 18:16:11 -050043 int limit[MAX_HUFCODE_BITS+1], base[MAX_HUFCODE_BITS], permute[MAX_SYMBOLS];
Rob Landley6973a1d2006-11-25 16:50:00 -050044 char minLen, maxLen;
45};
46
Rob Landleyd3236c12007-12-24 19:53:20 -060047// Data for burrows wheeler transform
48
49struct bwdata {
50 unsigned int origPtr;
51 int byteCount[256];
52 // State saved when interrupting output
53 int writePos, writeRun, writeCount, writeCurrent;
54 unsigned int dataCRC, headerCRC;
55 unsigned int *dbuf;
56};
57
Rob Landley2de8f3e2006-11-26 17:19:18 -050058// Structure holding all the housekeeping data, including IO buffers and
59// memory that persists between calls to bunzip
Rob Landleyd3236c12007-12-24 19:53:20 -060060struct bunzip_data {
Rob Landley2de8f3e2006-11-26 17:19:18 -050061
Rob Landley2de8f3e2006-11-26 17:19:18 -050062 // Input stream, input buffer, input bit buffer
Rob Landley46968f72006-12-31 19:10:24 -050063 int in_fd, inbufCount, inbufPos;
64 char *inbuf;
Rob Landley6973a1d2006-11-25 16:50:00 -050065 unsigned int inbufBitCount, inbufBits;
Rob Landley2de8f3e2006-11-26 17:19:18 -050066
67 // Output buffer
Rob Landley6973a1d2006-11-25 16:50:00 -050068 char outbuf[IOBUF_SIZE];
69 int outbufPos;
Rob Landley6973a1d2006-11-25 16:50:00 -050070
Rob Landleyd3236c12007-12-24 19:53:20 -060071 unsigned int totalCRC;
Rob Landley2de8f3e2006-11-26 17:19:18 -050072
Rob Landleyd3236c12007-12-24 19:53:20 -060073 // First pass decompression data (Huffman and MTF decoding)
Rob Landley46968f72006-12-31 19:10:24 -050074 char selectors[32768]; // nSelectors=15 bits
75 struct group_data groups[MAX_GROUPS]; // huffman coding tables
Rob Landleyd3236c12007-12-24 19:53:20 -060076 int symTotal, groupCount, nSelectors;
77 unsigned char symToByte[256], mtfSymbol[256];
78
79 // The CRC values stored in the block header and calculated from the data
80 unsigned int crc32Table[256];
81
82 // Second pass decompression data (burrows-wheeler transform)
83 unsigned int dbufSize;
84 struct bwdata bwdata[THREADS];
85};
Rob Landley6973a1d2006-11-25 16:50:00 -050086
Rob Landley2de8f3e2006-11-26 17:19:18 -050087// Return the next nnn bits of input. All reads from the compressed input
88// are done through this function. All reads are big endian.
Rob Landleyd3236c12007-12-24 19:53:20 -060089static unsigned int get_bits(struct bunzip_data *bd, char bits_wanted)
Rob Landley6973a1d2006-11-25 16:50:00 -050090{
Rob Landley2de8f3e2006-11-26 17:19:18 -050091 unsigned int bits = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -050092
Rob Landley2de8f3e2006-11-26 17:19:18 -050093 // If we need to get more data from the byte buffer, do so. (Loop getting
94 // one byte at a time to enforce endianness and avoid unaligned access.)
95 while (bd->inbufBitCount < bits_wanted) {
96
97 // If we need to read more data from file into byte buffer, do so
98 if (bd->inbufPos == bd->inbufCount) {
Rob Landley28a0dec2006-11-26 18:27:33 -050099 if (0 >= (bd->inbufCount = read(bd->in_fd, bd->inbuf, IOBUF_SIZE)))
Rob Landley6000f132007-01-18 22:00:12 -0500100 error_exit("Unexpected input EOF");
Rob Landley2de8f3e2006-11-26 17:19:18 -0500101 bd->inbufPos = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500102 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500103
104 // Avoid 32-bit overflow (dump bit buffer to top of output)
105 if (bd->inbufBitCount>=24) {
106 bits = bd->inbufBits&((1<<bd->inbufBitCount)-1);
107 bits_wanted -= bd->inbufBitCount;
108 bits <<= bits_wanted;
109 bd->inbufBitCount = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500110 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500111
112 // Grab next 8 bits of input from buffer.
113 bd->inbufBits = (bd->inbufBits<<8) | bd->inbuf[bd->inbufPos++];
114 bd->inbufBitCount += 8;
Rob Landley6973a1d2006-11-25 16:50:00 -0500115 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500116
117 // Calculate result
118 bd->inbufBitCount -= bits_wanted;
119 bits |= (bd->inbufBits>>bd->inbufBitCount) & ((1<<bits_wanted)-1);
Rob Landley6973a1d2006-11-25 16:50:00 -0500120
121 return bits;
122}
123
Rob Landleyd3236c12007-12-24 19:53:20 -0600124/* Read block header at start of a new compressed data block. Consists of:
125 *
126 * 48 bits : Block signature, either pi (data block) or e (EOF block).
127 * 32 bits : bw->headerCRC
128 * 1 bit : obsolete feature flag.
129 * 24 bits : origPtr (Burrows-wheeler unwind index, only 20 bits ever used)
130 * 16 bits : Mapping table index.
131 *[16 bits]: symToByte[symTotal] (Mapping table. For each bit set in mapping
132 * table index above, read another 16 bits of mapping table data.
133 * If correspondig bit is unset, all bits in that mapping table
134 * section are 0.)
135 * 3 bits : groupCount (how many huffman tables used to encode, anywhere
136 * from 2 to MAX_GROUPS)
137 * variable: hufGroup[groupCount] (MTF encoded huffman table data.)
138 */
139
140static int read_block_header(struct bunzip_data *bd, struct bwdata *bw)
Rob Landley6973a1d2006-11-25 16:50:00 -0500141{
Rob Landleyd3236c12007-12-24 19:53:20 -0600142 struct group_data *hufGroup;
143 int hh, ii, jj, kk, symCount, *base, *limit;
144 unsigned char uc;
Rob Landley6973a1d2006-11-25 16:50:00 -0500145
Rob Landley6000f132007-01-18 22:00:12 -0500146 // Read in header signature and CRC (which is stored big endian)
Rob Landleyd3236c12007-12-24 19:53:20 -0600147 ii = get_bits(bd, 24);
148 jj = get_bits(bd, 24);
149 bw->headerCRC = get_bits(bd,32);
Rob Landley2de8f3e2006-11-26 17:19:18 -0500150
Rob Landleyd3236c12007-12-24 19:53:20 -0600151 // Is this the EOF block with CRC for whole file? (Constant is "e")
152 if (ii==0x177245 && jj==0x385090) return RETVAL_LAST_BLOCK;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500153
Rob Landleyd3236c12007-12-24 19:53:20 -0600154 // Is this a valid data block? (Constant is "pi".)
155 if (ii!=0x314159 || jj!=0x265359) return RETVAL_NOT_BZIP_DATA;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500156
157 // We can add support for blockRandomised if anybody complains.
158 if (get_bits(bd,1)) return RETVAL_OBSOLETE_INPUT;
Rob Landleyd3236c12007-12-24 19:53:20 -0600159 if ((bw->origPtr = get_bits(bd,24)) > bd->dbufSize)
160 return RETVAL_DATA_ERROR;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500161
162 // mapping table: if some byte values are never used (encoding things
163 // like ascii text), the compression code removes the gaps to have fewer
164 // symbols to deal with, and writes a sparse bitfield indicating which
165 // values were present. We make a translation table to convert the symbols
166 // back to the corresponding bytes.
Rob Landleyd3236c12007-12-24 19:53:20 -0600167 hh = get_bits(bd, 16);
168 bd->symTotal = 0;
169 for (ii=0; ii<16; ii++) {
170 if (hh & (1 << (15 - ii))) {
171 kk = get_bits(bd, 16);
172 for (jj=0; jj<16; jj++)
173 if (kk & (1 << (15 - jj)))
174 bd->symToByte[bd->symTotal++] = (16 * ii) + jj;
Rob Landley6973a1d2006-11-25 16:50:00 -0500175 }
176 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500177
178 // How many different huffman coding groups does this block use?
Rob Landleyd3236c12007-12-24 19:53:20 -0600179 bd->groupCount = get_bits(bd,3);
180 if (bd->groupCount<2 || bd->groupCount>MAX_GROUPS) return RETVAL_DATA_ERROR;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500181
Rob Landleyd3236c12007-12-24 19:53:20 -0600182 // nSelectors: Every GROUP_SIZE many symbols we switch huffman coding
183 // tables. Each group has a selector, which is an index into the huffman
184 // coding table arrays.
185 //
186 // Read in the group selector array, which is stored as MTF encoded
Rob Landley0dc2d492007-01-17 13:58:08 -0500187 // bit runs. (MTF = Move To Front. Every time a symbol occurs it's moved
188 // to the front of the table, so it has a shorter encoding next time.)
Rob Landleyd3236c12007-12-24 19:53:20 -0600189 if (!(bd->nSelectors = get_bits(bd, 15))) return RETVAL_DATA_ERROR;
190 for (ii=0; ii<bd->groupCount; ii++) bd->mtfSymbol[ii] = ii;
191 for (ii=0; ii<bd->nSelectors; ii++) {
Rob Landley2de8f3e2006-11-26 17:19:18 -0500192
193 // Get next value
Rob Landleyd3236c12007-12-24 19:53:20 -0600194 for(jj=0;get_bits(bd,1);jj++)
195 if (jj>=bd->groupCount) return RETVAL_DATA_ERROR;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500196
Rob Landley0dc2d492007-01-17 13:58:08 -0500197 // Decode MTF to get the next selector, and move it to the front.
Rob Landleyd3236c12007-12-24 19:53:20 -0600198 uc = bd->mtfSymbol[jj];
199 memmove(bd->mtfSymbol+1, bd->mtfSymbol, jj);
200 bd->mtfSymbol[0] = bd->selectors[ii] = uc;
Rob Landley6973a1d2006-11-25 16:50:00 -0500201 }
Rob Landleyd3236c12007-12-24 19:53:20 -0600202
Rob Landley2de8f3e2006-11-26 17:19:18 -0500203 // Read the huffman coding tables for each group, which code for symTotal
204 // literal symbols, plus two run symbols (RUNA, RUNB)
Rob Landleyd3236c12007-12-24 19:53:20 -0600205 symCount = bd->symTotal+2;
206 for (jj=0; jj<bd->groupCount; jj++) {
Rob Landley8cca60d2008-06-28 01:07:34 -0500207 unsigned char length[MAX_SYMBOLS];
208 unsigned temp[MAX_HUFCODE_BITS+1];
209 int minLen, maxLen, pp;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500210
211 // Read lengths
Rob Landleyd3236c12007-12-24 19:53:20 -0600212 hh = get_bits(bd, 5);
213 for (ii = 0; ii < symCount; ii++) {
Rob Landley6973a1d2006-11-25 16:50:00 -0500214 for(;;) {
Rob Landleyd3236c12007-12-24 19:53:20 -0600215 // !hh || hh > MAX_HUFCODE_BITS in one test.
216 if (MAX_HUFCODE_BITS-1 < (unsigned)hh-1)
Rob Landleycac0ab02007-01-17 16:31:23 -0500217 return RETVAL_DATA_ERROR;
Rob Landley59b30172007-01-19 16:01:54 -0500218 // Grab 2 bits instead of 1 (slightly smaller/faster). Stop if
219 // first bit is 0, otherwise second bit says whether to
220 // increment or decrement.
Rob Landleyd3236c12007-12-24 19:53:20 -0600221 kk = get_bits(bd, 2);
222 if (kk & 2) hh += 1 - ((kk&1)<<1);
Rob Landley59b30172007-01-19 16:01:54 -0500223 else {
224 bd->inbufBitCount++;
225 break;
226 }
Rob Landley6973a1d2006-11-25 16:50:00 -0500227 }
Rob Landleyd3236c12007-12-24 19:53:20 -0600228 length[ii] = hh;
Rob Landley6973a1d2006-11-25 16:50:00 -0500229 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500230
231 // Find largest and smallest lengths in this group
232 minLen = maxLen = length[0];
Rob Landleyd3236c12007-12-24 19:53:20 -0600233 for (ii = 1; ii < symCount; ii++) {
234 if(length[ii] > maxLen) maxLen = length[ii];
235 else if(length[ii] < minLen) minLen = length[ii];
Rob Landley6973a1d2006-11-25 16:50:00 -0500236 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500237
Rob Landley6973a1d2006-11-25 16:50:00 -0500238 /* Calculate permute[], base[], and limit[] tables from length[].
239 *
240 * permute[] is the lookup table for converting huffman coded symbols
Rob Landley94515a22007-01-19 16:31:11 -0500241 * into decoded symbols. It contains symbol values sorted by length.
Rob Landley2c226852007-11-15 18:30:30 -0600242 *
Rob Landley94515a22007-01-19 16:31:11 -0500243 * base[] is the amount to subtract from the value of a huffman symbol
244 * of a given length when using permute[].
Rob Landley6973a1d2006-11-25 16:50:00 -0500245 *
246 * limit[] indicates the largest numerical value a symbol with a given
247 * number of bits can have. It lets us know when to stop reading.
248 *
Rob Landley2de8f3e2006-11-26 17:19:18 -0500249 * To use these, keep reading bits until value <= limit[bitcount] or
Rob Landley6973a1d2006-11-25 16:50:00 -0500250 * you've read over 20 bits (error). Then the decoded symbol
Rob Landley0dc2d492007-01-17 13:58:08 -0500251 * equals permute[hufcode_value - base[hufcode_bitcount]].
Rob Landley6973a1d2006-11-25 16:50:00 -0500252 */
Rob Landleyd3236c12007-12-24 19:53:20 -0600253 hufGroup = bd->groups+jj;
Rob Landley6973a1d2006-11-25 16:50:00 -0500254 hufGroup->minLen = minLen;
255 hufGroup->maxLen = maxLen;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500256
257 // Note that minLen can't be smaller than 1, so we adjust the base
258 // and limit array pointers so we're not always wasting the first
259 // entry. We do this again when using them (during symbol decoding).
Rob Landley0dc2d492007-01-17 13:58:08 -0500260 base = hufGroup->base-1;
261 limit = hufGroup->limit-1;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500262
Rob Landley28964802008-01-19 17:08:39 -0600263 // zero temp[] and limit[], and calculate permute[]
Rob Landley6973a1d2006-11-25 16:50:00 -0500264 pp = 0;
Rob Landleyd3236c12007-12-24 19:53:20 -0600265 for (ii = minLen; ii <= maxLen; ii++) {
266 temp[ii] = limit[ii] = 0;
267 for (hh = 0; hh < symCount; hh++)
268 if (length[hh] == ii)
269 hufGroup->permute[pp++] = hh;
270 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500271
Rob Landley94515a22007-01-19 16:31:11 -0500272 // Count symbols coded for at each bit length
Rob Landleyd3236c12007-12-24 19:53:20 -0600273 for (ii = 0; ii < symCount; ii++) temp[length[ii]]++;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500274
Rob Landley6973a1d2006-11-25 16:50:00 -0500275 /* Calculate limit[] (the largest symbol-coding value at each bit
276 * length, which is (previous limit<<1)+symbols at this level), and
277 * base[] (number of symbols to ignore at each bit length, which is
Rob Landley94515a22007-01-19 16:31:11 -0500278 * limit minus the cumulative count of symbols coded for already). */
Rob Landleyd3236c12007-12-24 19:53:20 -0600279 pp = hh = 0;
280 for (ii = minLen; ii < maxLen; ii++) {
281 pp += temp[ii];
282 limit[ii] = pp-1;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500283 pp <<= 1;
Rob Landleyd3236c12007-12-24 19:53:20 -0600284 base[ii+1] = pp-(hh+=temp[ii]);
Rob Landley6973a1d2006-11-25 16:50:00 -0500285 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500286 limit[maxLen] = pp+temp[maxLen]-1;
Rob Landley813840c2007-01-18 18:16:11 -0500287 limit[maxLen+1] = INT_MAX;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500288 base[minLen] = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500289 }
Rob Landley6973a1d2006-11-25 16:50:00 -0500290
Rob Landleyd3236c12007-12-24 19:53:20 -0600291 return 0;
292}
293
294/* First pass, read block's symbols into dbuf[dbufCount].
295 *
296 * This undoes three types of compression: huffman coding, run length encoding,
297 * and move to front encoding. We have to undo all those to know when we've
298 * read enough input.
299 */
300
301static int read_huffman_data(struct bunzip_data *bd, struct bwdata *bw)
302{
303 struct group_data *hufGroup;
304 int hh, ii, jj, kk, runPos, dbufCount, symCount, selector, nextSym,
305 *byteCount, *base, *limit;
306 unsigned int *dbuf = bw->dbuf;
307 unsigned char uc;
308
Rob Landley2de8f3e2006-11-26 17:19:18 -0500309 // We've finished reading and digesting the block header. Now read this
310 // block's huffman coded symbols from the file and undo the huffman coding
311 // and run length encoding, saving the result into dbuf[dbufCount++] = uc
312
313 // Initialize symbol occurrence counters and symbol mtf table
Rob Landleyd3236c12007-12-24 19:53:20 -0600314 byteCount = bw->byteCount;
315 for(ii=0; ii<256; ii++) {
316 byteCount[ii] = 0;
317 bd->mtfSymbol[ii] = ii;
Rob Landley125a2c72007-01-20 12:30:19 -0500318 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500319
Rob Landley0dc2d492007-01-17 13:58:08 -0500320 // Loop through compressed symbols. This is the first "tight inner loop"
321 // that needs to be micro-optimized for speed. (This one fills out dbuf[]
322 // linearly, staying in cache more, so isn't as limited by DRAM access.)
Rob Landley2de8f3e2006-11-26 17:19:18 -0500323 runPos = dbufCount = symCount = selector = 0;
Rob Landleyd3236c12007-12-24 19:53:20 -0600324 // Some unnecessary initializations to shut gcc up.
325 base = limit = 0;
326 hufGroup = 0;
327 hh = 0;
328
Rob Landley2de8f3e2006-11-26 17:19:18 -0500329 for (;;) {
330
Rob Landleyd3236c12007-12-24 19:53:20 -0600331 // Have we reached the end of this huffman group?
Rob Landley2de8f3e2006-11-26 17:19:18 -0500332 if (!(symCount--)) {
Rob Landleyd3236c12007-12-24 19:53:20 -0600333 // Determine which huffman coding group to use.
Rob Landley2de8f3e2006-11-26 17:19:18 -0500334 symCount = GROUP_SIZE-1;
Rob Landleyd3236c12007-12-24 19:53:20 -0600335 if (selector >= bd->nSelectors) return RETVAL_DATA_ERROR;
336 hufGroup = bd->groups + bd->selectors[selector++];
Rob Landley2de8f3e2006-11-26 17:19:18 -0500337 base = hufGroup->base-1;
338 limit = hufGroup->limit-1;
Rob Landley6973a1d2006-11-25 16:50:00 -0500339 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500340
Rob Landleyd3236c12007-12-24 19:53:20 -0600341 // Read next huffman-coded symbol (into jj).
342 ii = hufGroup->minLen;
343 jj = get_bits(bd, ii);
344 while (jj > limit[ii]) {
345 // if (ii > hufGroup->maxLen) return RETVAL_DATA_ERROR;
346 ii++;
Rob Landley6973a1d2006-11-25 16:50:00 -0500347
Rob Landley813840c2007-01-18 18:16:11 -0500348 // Unroll get_bits() to avoid a function call when the data's in
349 // the buffer already.
Rob Landleyd3236c12007-12-24 19:53:20 -0600350 kk = bd->inbufBitCount
Rob Landley813840c2007-01-18 18:16:11 -0500351 ? (bd->inbufBits >> --(bd->inbufBitCount)) & 1
352 : get_bits(bd, 1);
Rob Landleyd3236c12007-12-24 19:53:20 -0600353 jj = (jj << 1) | kk;
Rob Landley6973a1d2006-11-25 16:50:00 -0500354 }
Rob Landleyd3236c12007-12-24 19:53:20 -0600355 // Huffman decode jj into nextSym (with bounds checking)
356 jj-=base[ii];
Rob Landley2de8f3e2006-11-26 17:19:18 -0500357
Rob Landleyd3236c12007-12-24 19:53:20 -0600358 if (ii > hufGroup->maxLen || (unsigned)jj >= MAX_SYMBOLS)
Rob Landley813840c2007-01-18 18:16:11 -0500359 return RETVAL_DATA_ERROR;
Rob Landleyd3236c12007-12-24 19:53:20 -0600360 nextSym = hufGroup->permute[jj];
Rob Landley2de8f3e2006-11-26 17:19:18 -0500361
362 // If this is a repeated run, loop collecting data
Rob Landley8aa0e942007-01-17 16:56:28 -0500363 if ((unsigned)nextSym <= SYMBOL_RUNB) {
Rob Landley2de8f3e2006-11-26 17:19:18 -0500364
365 // If this is the start of a new run, zero out counter
Rob Landley6973a1d2006-11-25 16:50:00 -0500366 if(!runPos) {
367 runPos = 1;
Rob Landleyd3236c12007-12-24 19:53:20 -0600368 hh = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500369 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500370
Rob Landley6973a1d2006-11-25 16:50:00 -0500371 /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at
372 each bit position, add 1 or 2 instead. For example,
373 1011 is 1<<0 + 1<<1 + 2<<2. 1010 is 2<<0 + 2<<1 + 1<<2.
374 You can make any bit pattern that way using 1 less symbol than
375 the basic or 0/1 method (except all bits 0, which would use no
376 symbols, but a run of length 0 doesn't mean anything in this
377 context). Thus space is saved. */
Rob Landleyd3236c12007-12-24 19:53:20 -0600378 hh += (runPos << nextSym); // +runPos if RUNA; +2*runPos if RUNB
Rob Landley6973a1d2006-11-25 16:50:00 -0500379 runPos <<= 1;
380 continue;
381 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500382
Rob Landley6973a1d2006-11-25 16:50:00 -0500383 /* When we hit the first non-run symbol after a run, we now know
384 how many times to repeat the last literal, so append that many
385 copies to our buffer of decoded symbols (dbuf) now. (The last
386 literal used is the one at the head of the mtfSymbol array.) */
Rob Landley2de8f3e2006-11-26 17:19:18 -0500387 if (runPos) {
388 runPos = 0;
Rob Landleyd3236c12007-12-24 19:53:20 -0600389 if (dbufCount+hh >= bd->dbufSize) return RETVAL_DATA_ERROR;
Rob Landley6973a1d2006-11-25 16:50:00 -0500390
Rob Landleyd3236c12007-12-24 19:53:20 -0600391 uc = bd->symToByte[bd->mtfSymbol[0]];
392 byteCount[uc] += hh;
393 while (hh--) dbuf[dbufCount++] = uc;
Rob Landley6973a1d2006-11-25 16:50:00 -0500394 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500395
396 // Is this the terminating symbol?
Rob Landleyd3236c12007-12-24 19:53:20 -0600397 if (nextSym>bd->symTotal) break;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500398
Rob Landley6973a1d2006-11-25 16:50:00 -0500399 /* At this point, the symbol we just decoded indicates a new literal
400 character. Subtract one to get the position in the MTF array
401 at which this literal is currently to be found. (Note that the
402 result can't be -1 or 0, because 0 and 1 are RUNA and RUNB.
403 Another instance of the first symbol in the mtf array, position 0,
404 would have been handled as part of a run.) */
Rob Landleyd3236c12007-12-24 19:53:20 -0600405 if (dbufCount>=bd->dbufSize) return RETVAL_DATA_ERROR;
406 ii = nextSym - 1;
407 uc = bd->mtfSymbol[ii];
Rob Landleyc4a55212007-01-17 18:24:17 -0500408 // On my laptop, unrolling this memmove() into a loop shaves 3.5% off
409 // the total running time.
Rob Landleyd3236c12007-12-24 19:53:20 -0600410 while(ii--) bd->mtfSymbol[ii+1] = bd->mtfSymbol[ii];
411 bd->mtfSymbol[0] = uc;
412 uc = bd->symToByte[uc];
Rob Landley2de8f3e2006-11-26 17:19:18 -0500413
414 // We have our literal byte. Save it into dbuf.
Rob Landley6973a1d2006-11-25 16:50:00 -0500415 byteCount[uc]++;
416 dbuf[dbufCount++] = (unsigned int)uc;
417 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500418
Rob Landley2de8f3e2006-11-26 17:19:18 -0500419 // Now we know what dbufCount is, do a better sanity check on origPtr.
Rob Landleyd3236c12007-12-24 19:53:20 -0600420 if (bw->origPtr >= (bw->writeCount = dbufCount)) return RETVAL_DATA_ERROR;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500421
Rob Landleyd3236c12007-12-24 19:53:20 -0600422 return 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500423}
424
Rob Landley2de8f3e2006-11-26 17:19:18 -0500425// Flush output buffer to disk
Rob Landleyd3236c12007-12-24 19:53:20 -0600426void flush_bunzip_outbuf(struct bunzip_data *bd, int out_fd)
Rob Landley6973a1d2006-11-25 16:50:00 -0500427{
Rob Landley2de8f3e2006-11-26 17:19:18 -0500428 if (bd->outbufPos) {
429 if (write(out_fd, bd->outbuf, bd->outbufPos) != bd->outbufPos)
Rob Landley6000f132007-01-18 22:00:12 -0500430 error_exit("Unexpected output EOF");
Rob Landley2de8f3e2006-11-26 17:19:18 -0500431 bd->outbufPos = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500432 }
433}
434
Rob Landleyd3236c12007-12-24 19:53:20 -0600435void burrows_wheeler_prep(struct bunzip_data *bd, struct bwdata *bw)
436{
437 int ii, jj;
438 unsigned int *dbuf = bw->dbuf;
439 int *byteCount = bw->byteCount;
440
441 // Technically this part is preparation for the burrows-wheeler
442 // transform, but it's quick and convenient to do here.
443
444 // Turn byteCount into cumulative occurrence counts of 0 to n-1.
445 jj = 0;
446 for (ii=0; ii<256; ii++) {
447 int kk = jj + byteCount[ii];
448 byteCount[ii] = jj;
449 jj = kk;
450 }
451
452 // Use occurrence counts to quickly figure out what order dbuf would be in
453 // if we sorted it.
454 for (ii=0; ii < bw->writeCount; ii++) {
455 unsigned char uc = dbuf[ii];
456 dbuf[byteCount[uc]] |= (ii << 8);
457 byteCount[uc]++;
458 }
459
460 // blockRandomised support would go here.
461
462 // Using ii as position, jj as previous character, hh as current character,
463 // and uc as run count.
464 bw->dataCRC = 0xffffffffL;
465
466 /* Decode first byte by hand to initialize "previous" byte. Note that it
467 doesn't get output, and if the first three characters are identical
468 it doesn't qualify as a run (hence uc=255, which will either wrap
469 to 1 or get reset). */
470 if (bw->writeCount) {
471 bw->writePos = dbuf[bw->origPtr];
472 bw->writeCurrent = (unsigned char)bw->writePos;
473 bw->writePos >>= 8;
474 bw->writeRun = -1;
475 }
476}
477
478// Decompress a block of text to intermediate buffer
479int read_bunzip_data(struct bunzip_data *bd)
480{
481 int rc = read_block_header(bd, bd->bwdata);
482 if (!rc) rc=read_huffman_data(bd, bd->bwdata);
483
484 // First thing that can be done by a background thread.
485 burrows_wheeler_prep(bd, bd->bwdata);
486
487 return rc;
488}
489
Rob Landley2de8f3e2006-11-26 17:19:18 -0500490// Undo burrows-wheeler transform on intermediate buffer to produce output.
491// If !len, write up to len bytes of data to buf. Otherwise write to out_fd.
Rob Landleyd3236c12007-12-24 19:53:20 -0600492// Returns len ? bytes written : 0. Notice all errors are negative #'s.
493//
494// Burrows-wheeler transform is described at:
495// http://dogma.net/markn/articles/bwt/bwt.htm
496// http://marknelson.us/1996/09/01/bwt/
497
498int write_bunzip_data(struct bunzip_data *bd, struct bwdata *bw, int out_fd, char *outbuf, int len)
Rob Landley6973a1d2006-11-25 16:50:00 -0500499{
Rob Landleyd3236c12007-12-24 19:53:20 -0600500 unsigned int *dbuf = bw->dbuf;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500501 int count, pos, current, run, copies, outbyte, previous, gotcount = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500502
Rob Landley2de8f3e2006-11-26 17:19:18 -0500503 for (;;) {
504
505 // If last read was short due to end of file, return last block now
Rob Landleyd3236c12007-12-24 19:53:20 -0600506 if (bw->writeCount < 0) return bw->writeCount;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500507
508 // If we need to refill dbuf, do it.
Rob Landleyd3236c12007-12-24 19:53:20 -0600509 if (!bw->writeCount) {
Rob Landley2de8f3e2006-11-26 17:19:18 -0500510 int i = read_bunzip_data(bd);
511 if (i) {
512 if (i == RETVAL_LAST_BLOCK) {
Rob Landleyd3236c12007-12-24 19:53:20 -0600513 bw->writeCount = i;
Rob Landley6973a1d2006-11-25 16:50:00 -0500514 return gotcount;
515 } else return i;
516 }
517 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500518
Rob Landleyd3236c12007-12-24 19:53:20 -0600519 // loop generating output
520 count = bw->writeCount;
521 pos = bw->writePos;
522 current = bw->writeCurrent;
523 run = bw->writeRun;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500524 while (count) {
525
526 // If somebody (like tar) wants a certain number of bytes of
527 // data from memory instead of written to a file, humor them.
528 if (len && bd->outbufPos>=len) goto dataus_interruptus;
Rob Landley6973a1d2006-11-25 16:50:00 -0500529 count--;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500530
531 // Follow sequence vector to undo Burrows-Wheeler transform.
532 previous = current;
533 pos = dbuf[pos];
534 current = pos&0xff;
535 pos >>= 8;
536
537 // Whenever we see 3 consecutive copies of the same byte,
538 // the 4th is a repeat count
539 if (run++ == 3) {
540 copies = current;
541 outbyte = previous;
542 current = -1;
Rob Landley6973a1d2006-11-25 16:50:00 -0500543 } else {
Rob Landley2de8f3e2006-11-26 17:19:18 -0500544 copies = 1;
545 outbyte = current;
Rob Landley6973a1d2006-11-25 16:50:00 -0500546 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500547
548 // Output bytes to buffer, flushing to file if necessary
549 while (copies--) {
550 if (bd->outbufPos == IOBUF_SIZE) flush_bunzip_outbuf(bd,out_fd);
Rob Landley6973a1d2006-11-25 16:50:00 -0500551 bd->outbuf[bd->outbufPos++] = outbyte;
Rob Landleyd3236c12007-12-24 19:53:20 -0600552 bw->dataCRC = (bw->dataCRC << 8)
553 ^ bd->crc32Table[(bw->dataCRC >> 24) ^ outbyte];
Rob Landley6973a1d2006-11-25 16:50:00 -0500554 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500555 if (current!=previous) run=0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500556 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500557
Rob Landleyd3236c12007-12-24 19:53:20 -0600558 // decompression of this block completed successfully
559 bw->dataCRC = ~(bw->dataCRC);
Rob Landley2de8f3e2006-11-26 17:19:18 -0500560 bd->totalCRC = ((bd->totalCRC << 1) | (bd->totalCRC >> 31))
Rob Landleyd3236c12007-12-24 19:53:20 -0600561 ^ bw->dataCRC;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500562
Rob Landleyd3236c12007-12-24 19:53:20 -0600563 // if this block had a crc error, force file level crc error.
564 if (bw->dataCRC != bw->headerCRC) {
565 bd->totalCRC = bw->headerCRC+1;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500566
Rob Landley6973a1d2006-11-25 16:50:00 -0500567 return RETVAL_LAST_BLOCK;
568 }
569dataus_interruptus:
Rob Landleyd3236c12007-12-24 19:53:20 -0600570 bw->writeCount = count;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500571 if (len) {
572 gotcount += bd->outbufPos;
573 memcpy(outbuf, bd->outbuf, len);
574
575 // If we got enough data, checkpoint loop state and return
576 if ((len-=bd->outbufPos)<1) {
577 bd->outbufPos -= len;
578 if (bd->outbufPos)
579 memmove(bd->outbuf, bd->outbuf+len, bd->outbufPos);
Rob Landleyd3236c12007-12-24 19:53:20 -0600580 bw->writePos = pos;
581 bw->writeCurrent = current;
582 bw->writeRun = run;
Rob Landley2de8f3e2006-11-26 17:19:18 -0500583
Rob Landley6973a1d2006-11-25 16:50:00 -0500584 return gotcount;
585 }
586 }
587 }
588}
589
Rob Landley2de8f3e2006-11-26 17:19:18 -0500590// Allocate the structure, read file header. If !len, src_fd contains
591// filehandle to read from. Else inbuf contains data.
Rob Landleyd3236c12007-12-24 19:53:20 -0600592int start_bunzip(struct bunzip_data **bdp, int src_fd, char *inbuf, int len)
Rob Landley6973a1d2006-11-25 16:50:00 -0500593{
Rob Landleyd3236c12007-12-24 19:53:20 -0600594 struct bunzip_data *bd;
Rob Landley7e849c52009-01-03 18:15:18 -0600595 unsigned int i;
Rob Landley6973a1d2006-11-25 16:50:00 -0500596
Rob Landley2de8f3e2006-11-26 17:19:18 -0500597 // Figure out how much data to allocate.
Rob Landleyd3236c12007-12-24 19:53:20 -0600598 i = sizeof(struct bunzip_data);
Rob Landley2de8f3e2006-11-26 17:19:18 -0500599 if (!len) i += IOBUF_SIZE;
600
601 // Allocate bunzip_data. Most fields initialize to zero.
Rob Landley6000f132007-01-18 22:00:12 -0500602 bd = *bdp = xzalloc(i);
Rob Landley2de8f3e2006-11-26 17:19:18 -0500603 if (len) {
604 bd->inbuf = inbuf;
605 bd->inbufCount = len;
606 bd->in_fd = -1;
Rob Landley6973a1d2006-11-25 16:50:00 -0500607 } else {
Rob Landley2de8f3e2006-11-26 17:19:18 -0500608 bd->inbuf = (char *)(bd+1);
609 bd->in_fd = src_fd;
Rob Landley6973a1d2006-11-25 16:50:00 -0500610 }
Rob Landley2de8f3e2006-11-26 17:19:18 -0500611
Rob Landleyb15b8fa2009-01-05 01:05:43 -0600612 crc_init(bd->crc32Table, 0);
Rob Landley2de8f3e2006-11-26 17:19:18 -0500613
Rob Landley2de8f3e2006-11-26 17:19:18 -0500614 // Ensure that file starts with "BZh".
615 for (i=0;i<3;i++)
616 if (get_bits(bd,8)!="BZh"[i]) return RETVAL_NOT_BZIP_DATA;
617
618 // Next byte ascii '1'-'9', indicates block size in units of 100k of
619 // uncompressed data. Allocate intermediate buffer for block.
620 i = get_bits(bd, 8);
Rob Landley6973a1d2006-11-25 16:50:00 -0500621 if (i<'1' || i>'9') return RETVAL_NOT_BZIP_DATA;
Rob Landleyd3236c12007-12-24 19:53:20 -0600622 bd->dbufSize = 100000*(i-'0')*THREADS;
623 for (i=0; i<THREADS; i++)
624 bd->bwdata[i].dbuf = xmalloc(bd->dbufSize * sizeof(int));
Rob Landley2de8f3e2006-11-26 17:19:18 -0500625
Rob Landleyd3236c12007-12-24 19:53:20 -0600626 return 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500627}
628
Rob Landley2de8f3e2006-11-26 17:19:18 -0500629// Example usage: decompress src_fd to dst_fd. (Stops at end of bzip data,
630// not end of file.)
Rob Landley6000f132007-01-18 22:00:12 -0500631void bunzipStream(int src_fd, int dst_fd)
Rob Landley6973a1d2006-11-25 16:50:00 -0500632{
Rob Landleyd3236c12007-12-24 19:53:20 -0600633 struct bunzip_data *bd;
634 int i, j;
Rob Landley6973a1d2006-11-25 16:50:00 -0500635
Rob Landley2de8f3e2006-11-26 17:19:18 -0500636 if (!(i = start_bunzip(&bd,src_fd,0,0))) {
Rob Landleyd3236c12007-12-24 19:53:20 -0600637 i = write_bunzip_data(bd,bd->bwdata,dst_fd,0,0);
638 if (i==RETVAL_LAST_BLOCK && bd->bwdata[0].headerCRC==bd->totalCRC)
639 i = 0;
Rob Landley6973a1d2006-11-25 16:50:00 -0500640 }
641 flush_bunzip_outbuf(bd,dst_fd);
Rob Landleyd3236c12007-12-24 19:53:20 -0600642 for (j=0; j<THREADS; j++) free(bd->bwdata[j].dbuf);
Rob Landley6973a1d2006-11-25 16:50:00 -0500643 free(bd);
Rob Landley6000f132007-01-18 22:00:12 -0500644 if (i) error_exit(bunzip_errors[-i]);
Rob Landley6973a1d2006-11-25 16:50:00 -0500645}