blob: a6253684883afc3c507ed776a6039be17da711cf [file] [log] [blame]
jnebf62cd2b2016-12-20 14:41:47 +01001#!python3
2"""Program to dump contents of Brotli compressed files showing the compression format.
3Jurjen N.E. Bos, 2016.
4I found the following issues with the Brotli format:
5- The distance alphabet has size 16+(48<<POSTFIX),
6 but the last symbols are useless.
7 It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter.
8- The block type code is useless if NBLTYPES==2, you would only need 1 symbol
9 anyway, so why don't you just switch to "the other" type?
10"""
11import struct
12from operator import itemgetter, methodcaller
13from itertools import accumulate, repeat
14from collections import defaultdict, deque
15from functools import partial
16
17class InvalidStream(Exception): pass
18#lookup table
19L, I, D = "literal", "insert&copy", "distance"
20pL, pI, pD = 'P'+L, 'P'+I, 'P'+D
21
22def outputCharFormatter(c):
23 """Show character in readable format
24 """
25 #TODO 2: allow hex only output
26 if 32<c<127: return chr(c)
27 elif c==10: return '\\n'
28 elif c==13: return '\\r'
29 elif c==32: return '" "'
30 else: return '\\x{:02x}'.format(c)
31
32def outputFormatter(s):
33 """Show string or char.
34 """
35 result = ''
36 def formatSubString(s):
37 for c in s:
38 if c==32: yield ' '
39 else: yield outputCharFormatter(c)
40 if len(result)<200: return ''.join(formatSubString(s))
41 else:
42 return ''.join(formatSubString(s[:100]))+'...'+ \
43 ''.join(formatSubString(s[-100:]))
44
45
46class BitStream:
47 """Represent a bytes object. Can read bits and prefix codes the way
48 Brotli does.
49 """
50 def __init__(self, byteString):
51 self.data = byteString
52 #position in bits: byte pos is pos>>3, bit pos is pos&7
53 self.pos = 0
54
55 def __repr__(self):
56 """Representation
57 >>> olleke
58 BitStream(pos=0:0)
59 """
60 return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7)
61
62 def read(self, n):
63 """Read n bits from the stream and return as an integer.
64 Produces zero bits beyond the stream.
65 >>> olleke.data[0]==27
66 True
67 >>> olleke.read(5)
68 27
69
70 >>> olleke
71 BitStream(pos=0:5)
72 """
73 value = self.peek(n)
74 self.pos += n
75 if self.pos>len(self.data)*8:
76 raise ValueError('Read past end of stream')
77 return value
78
79 def peek(self, n):
80 """Peek an n bit integer from the stream without updating the pointer.
81 It is not an error to read beyond the end of the stream.
82 >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803
83 True
84 >>> olleke.peek(15)
85 11803
86 >>> hex(olleke.peek(32))
87 '0x2e1b'
88 """
89 #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3]
90 #convert to int: int.from_bytes(..., 'little')
91 #shift out the bits from the first byte: >>(self.pos&7)
92 #mask unwanted bits: & (1<<n)-1
93 return int.from_bytes(
94 self.data[self.pos>>3:self.pos+n+7>>3],
95 'little')>>(self.pos&7) & (1<<n)-1
96
97 def readBytes(self, n):
98 """Read n bytes from the stream on a byte boundary.
99 """
100 if self.pos&7: raise ValueError('readBytes: need byte boundary')
101 result = self.data[self.pos>>3:(self.pos>>3)+n]
102 self.pos += 8*n
103 return result
104
105#-----------------------Symbol-------------------------------------------
106class Symbol:
107 """A symbol in a code.
108 Refers back to the code that contains it.
109 Index is the place in the alphabet of the symbol.
110 """
111 __slots__ = 'code', 'index'
112 def __init__(self, code, value):
113 self.code = code
114 self.index = value
115
116 def __repr__(self):
117 return 'Symbol({}, {})'.format(self.code.name, self.index)
118
119 def __len__(self):
120 """Number of bits in the prefix notation of this symbol
121 """
122 return self.code.length(self.index)
123
124 def __int__(self):
125 return self.index
126
127 #these routines call equivalent routine in Code class
128 def bitPattern(self):
129 """Value of the symbol in the stream
130 """
131 return self.code.bitPattern(self.index)
132
133 def extraBits(self):
134 """Number of extra bits to read for this symbol
135 """
136 return self.code.extraBits(self.index)
137
138 def __str__(self):
139 """Short descriptor of the symbol without extra bits.
140 """
141 return self.code.mnemonic(self.index)
142
143 #requiring optional extra bits, if self.code supports them
144 def value(self, extra=None):
145 """The value used for processing. Can be a tuple.
146 with optional extra bits
147 """
148 if isinstance(self.code, WithExtra):
149 if not 0<=extra<1<<self.extraBits():
150 raise ValueError("value: extra value doesn't fit in extraBits")
151 return self.code.value(self.index, extra)
152 if extra is not None:
153 raise ValueError('value: no extra bits for this code')
154 return self.code.value(self.index)
155
156 def explanation(self, extra=None):
157 """Long explanation of the value from the numeric value
158 with optional extra bits
159 Used by Layout.verboseRead when printing the value
160 """
161 if isinstance(self.code, WithExtra):
162 return self.code.callback(self, extra)
163 return self.code.callback(self)
164
165#========================Code definitions==================================
166class RangeDecoder:
167 """A decoder for the Code class that assumes the symbols
168 are encoded consecutively in binary.
169 It all depends on the "alphabetSize" property.
170 The range runs from 0 to alphabetSize-1.
171 This is the default decoder.
172 """
173 def __init__(self, *, alphabetSize=None, bitLength=None, **args):
174 if bitLength is not None: alphabetSize = 1<<bitLength
175 if alphabetSize is not None:
176 self.alphabetSize = alphabetSize
177 self.maxLength = (alphabetSize-1).bit_length()
178
179 def __len__(self):
180 return self.alphabetSize
181
182 def __iter__(self):
183 """Produce all symbols.
184 """
185 return map(partial(Symbol, self), range(len(self)))
186
187 def __getitem__(self, index):
188 if index>=self.alphabetSize: raise ValueError('index out of range')
189 return Symbol(self, index)
190
191 def bitPattern(self, index):
192 return '{:0{}b}'.format(index, self.maxLength)
193
194 def length(self, index):
195 """Encoding length of given symbol.
196 Does not depend on index in this case.
197 """
198 return self.maxLength
199
200 def decodePeek(self, data):
201 """Find which symbol index matches the given data (from peek, as a number)
202 and return the number of bits decoded.
203 Can also be used to figure out length of a symbol.
204 """
205 return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1)
206
207class PrefixDecoder:
208 """A decoder for the Code class that uses a prefix code.
209 The code is determined by encoding:
210 encode[p] gives the index corresponding to bit pattern p.
211 Used setDecode(decodeTable) to switch the decoder from the default
212 to a prefix decoder, or pass decodeTable at init.
213 You can also use setLength(lengthTable)
214 to define the encoding from the lengths.
215 The set of symbol values does not need to be consecutive.
216 """
217 def __init__(self, *, decodeTable=None, **args):
218 if decodeTable is not None: self.setDecode(decodeTable)
219
220 def __len__(self):
221 return len(self.decodeTable)
222
223 def __iter__(self):
224 def revBits(index):
225 return self.bitPattern(index)[::-1]
226 return (
227 Symbol(self, index)
228 for index in sorted(self.decodeTable.values(), key=revBits)
229 )
230
231 def __getitem__(self, index):
232 if index not in self.lengthTable:
233 raise ValueError('No symbol {}[{}]'.format(
234 self.__class__.__name__, index))
235 return Symbol(self, index)
236
237 def bitPattern(self, index):
238 bits = next(b for (b,s) in self.decodeTable.items() if s==index)
239 return '{:0{}b}'.format(bits, self.length(index))
240
241 def length(self, index):
242 """Encoding length of given symbol.
243 """
244 return self.lengthTable[index]
245
246 def decodePeek(self, data):
247 """Find which symbol index matches the given data (from peek, as a number)
248 and return the number of bits decoded.
249 Can also be used to figure out length of a symbol.
250 """
251 #do binary search for word length
252 #invariant: lo<=length<=hi
253 lo, hi = self.minLength, self.maxLength
254 while lo<=hi:
255 mid = lo+hi>>1
256 #note lo<=mid<hi at this point
257 mask = (1<<mid)-1
258 #lets see what happens if we guess length is mid
259 try: index = self.decodeTable[data&mask]
260 except KeyError:
261 #too many bits specified, reduce estimated length
262 hi = mid-1
263 continue
264 #we found a symbol, but there could be a longer match
265 symbolLength = self.lengthTable[index]
266 if symbolLength<=mid:
267 #all bits match, symbol must be right
268 return symbolLength, Symbol(self, index)
269 #there must be more bits to match
270 lo = mid+1
271 return lo, Symbol(self, index)
272
273 #routine to set up the tables
274 def setDecode(self, decodeTable):
275 """Store decodeTable,
276 and compute lengthTable, minLength, maxLength from encodings.
277 """
278 self.decodeTable = decodeTable
279 #set of symbols with unknown length
280 todo = set(decodeTable)
281 #bit size under investigation
282 maskLength = 0
283 lengthTable = {}
284 while todo:
285 mask = (1<<maskLength)-1
286 #split the encodings that we didn't find yet using b bits
287 splitSymbols = defaultdict(list)
288 for s in todo: splitSymbols[s&mask].append(s)
289 #unique encodings have a length of maskLength bits
290 #set length, and remove from todo list
291 for s,subset in splitSymbols.items():
292 if len(subset)==1:
293 lengthTable[self.decodeTable[s]] = maskLength
294 todo.remove(s)
295 #now investigate with longer mask
296 maskLength +=1
297 #save result
298 self.lengthTable = lengthTable
299 self.minLength = min(lengthTable.values())
300 self.maxLength = max(lengthTable.values())
301 self.switchToPrefix()
302
303 def setLength(self, lengthTable):
304 """Given the bit pattern lengths for symbols given in lengthTable,
305 set decodeTable, minLength, maxLength
306 """
307 self.lengthTable = lengthTable
308 self.minLength = min(lengthTable.values())
309 self.maxLength = max(lengthTable.values())
310 #compute the backwards codes first; then reverse them
311 #compute (backwards) first code for every separate lengths
312 nextCodes = []
313 #build codes for each length, from right to left
314 code = 0
315 for bits in range(self.maxLength+1):
316 code <<= 1
317 nextCodes.append(code)
318 code += sum(x==bits for x in lengthTable.values())
319 self.decodeTable = {}
320 #count codes for each length, and store reversed in the table
321 for symbol in sorted(lengthTable):
322 bits = lengthTable[symbol]
323 bitpattern = '{:0{}b}'.format(nextCodes[bits], bits)
324 self.decodeTable[int(bitpattern[::-1], 2)] = symbol
325 nextCodes[bits] += 1
326 self.switchToPrefix()
327
328 def switchToPrefix(self):
329 """This routine makes sure the prefix decoder is activated.
330 """
331 self.mode = PrefixDecoder
332
333class Code(RangeDecoder, PrefixDecoder):
334 """An alphabet of symbols, that can be read from a stream.
335 If you use setDecode or setLength, you have a prefix code,
336 otherwise you have a range code.
337 Features:
338 code[index] produces symbol with given index
339 value(index): value of symbol
340 mnemonic(index): short description of symbol
341 explanation(index): show meaning of symbol, shown in Layout.verboseRead
342 iter(code): produce all symbols in some order
343 name: show as context in Layout.verboseRead
344 """
345 name = '?'
346 #callback is a function that gets the symbol and the extra bits
347 #default callback calls explanation
348 def __init__(self, name=None, *, callback=None, description='', **args):
349 """Don't forget to set either alphabetSize or decodeTable
350 """
351 #set name when provided, otherwise take class variable
352 if name is not None: self.name = name
353 if callback is not None: self.callback = callback
354 self.description = description
355 #mode switch
356 if 'bitLength' in args or 'alphabetSize' in args:
357 self.mode = RangeDecoder
358 RangeDecoder.__init__(self, **args)
359 elif 'decodeTable' in args:
360 self.mode = PrefixDecoder
361 PrefixDecoder.__init__(self, **args)
362 else:
363 super().__init__(**args)
364
365 def __repr__(self):
366 return self.__class__.__name__+' '+self.name
367
368 #the routines that get switched between RangeDecoder and PrefixDecoder
369 def __len__(self): return self.mode.__len__(self)
370 def __iter__(self): return self.mode.__iter__(self)
371 def __getitem__(self, index): return self.mode.__getitem__(self, index)
372 def bitPattern(self, index): return self.mode.bitPattern(self, index)
373 def length(self, index): return self.mode.length(self, index)
374 def decodePeek(self, data): return self.mode.decodePeek(self, data)
375 #general routines
376 def value(self, index, extra=None):
377 """Get value of symbol for computations.
378 Override where needed.
379 """
380 if extra is not None:
381 raise ValueError('value: no extra for this symbol')
382 return index
383
384 def mnemonic(self, index):
385 """Give mnemonic of symbol.
386 Override where needed.
387 """
388 return str(self.value(index))
389
390 def callback(self, symbol):
391 return self.explanation(symbol.index)
392
393 def explanation(self, index):
394 """Long explanation of the value from the numeric value
395 This is a default routine.
396 You can customize in three ways:
397 - set description to add some text
398 - override to get more control
399 - set callback to make it dependent on you local variables
400 """
401 value = self.value(index)
402 return '{0}{1}: {2}'.format(
403 self.description and self.description+': ',
404 self.bitPattern(index),
405 value,
406 )
407
408 def extraBits(self, index):
409 return 0
410
411 #Routines that use the decode interface
412 def showCode(self, width=80):
413 """Show all words of the code in a nice format.
414 """
415 #make table of all symbols with binary strings
416 symbolStrings = [
417 (self.bitPattern(s.index), self.mnemonic(s.index))
418 for s in self
419 ]
420 #determine column widths the way Lisp programmers do it
421 leftColWidth, rightColWidth = map(max, map(
422 map,
423 repeat(len),
424 zip(*symbolStrings)
425 ))
426 colwidth = leftColWidth+rightColWidth
427 columns = 81//(colwidth+2)
428 rows = -(-len(symbolStrings)//columns)
429 def justify(bs):
430 b,s = bs
431 return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth)
432 for i in range(rows):
433 print(' '.join(map(justify, symbolStrings[i::rows])).rstrip())
434
435 def readTuple(self, stream):
436 """Read symbol from stream. Returns symbol, length.
437 """
438 length, symbol = self.decodePeek(stream.peek(self.maxLength))
439 stream.pos += length
440 return length, symbol
441
442 def readTupleAndExtra(self, stream):
443 return self.readTuple(stream)+(0, None)
444
445class WithExtra(Code):
446 """Extension for Code so that symbol may have extra bits associated.
447 If you supply an extraTable, you can use extraBits
448 You can define an extraTable,
449 which allows to call extraBits to get the number of extraBits.
450 Otherwise, you can supply extraBits yourself.
451 Routine readTupleAndExtra now reads the extra bits too.
452 Value probably needs to be overridden; see Enumerator.
453 Note: this does not give you an decodeTable.
454 """
455 #redefine these if you don't want to use an extraTable
456 def extraBits(self, index):
457 """Get the number of extra bits for this symbol.
458 """
459 return self.extraTable[index]
460
461 def mnemonic(self, index):
462 """This value must be independent of extra.
463 """
464 return str(index)
465
466 def readTupleAndExtra(self, stream):
467 """Read symbol and extrabits from stream.
468 Returns symbol length, symbol, extraBits, extra
469 >>> olleke.pos = 6
470 >>> MetablockLengthAlphabet().readTupleAndExtra(olleke)
471 (2, Symbol(MLEN, 4), 16, 46)
472 """
473 length, symbol = self.decodePeek(stream.peek(self.maxLength))
474 stream.pos += length
475 extraBits = self.extraBits(symbol.index)
476 return length, symbol, extraBits, stream.read(extraBits)
477
478 def explanation(self, index, extra=None):
479 """Expanded version of Code.explanation supporting extra bits.
480 If you don't supply extra, it is not mentioned.
481 """
482 extraBits = 0 if extra is None else self.extraBits(index)
483 if not hasattr(self, 'extraTable'):
484 formatString = '{0}{3}'
485 lo = hi = value = self.value(index, extra)
486 elif extraBits==0:
487 formatString = '{0}{2}: {3}'
488 lo, hi = self.span(index)
489 value = lo
490 else:
491 formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}'
492 lo, hi = self.span(index)
493 value = lo+extra
494 return formatString.format(
495 self.description and self.description+': ',
496 'x'*extraBits,
497 self.bitPattern(index),
498 lo, hi,
499 extra,
500 value,
501 )
502
503 def callback(self, symbol, extra):
504 return self.explanation(symbol.index, extra)
505
506class BoolCode(Code):
507 """Same as Code(bitLength=1), but shows a boolean.
508 """
509 def __init__(self, name=None, **args):
510 super().__init__(name, bitLength=1, **args)
511
512 def value(self, index, extra=None):
513 return bool(super().value(index, extra))
514
515class Enumerator(WithExtra):
516 """Code that is defined by the ExtraTable.
517 extraTable is a class variable that contains
518 the extraBits of the symbols from 0
519 value0 contains the value of symbol 0
520 encodings is not neccessary, but allowed.
521 Note: place for FixedCode to make sure extraBits works
522 """
523 def __init__(self, name=None, **args):
524 #if there is no decodeTable to determine length, compute it ourselves
525 if 'decodeTable' not in args:
526 args['alphabetSize'] = len(self.extraTable)
527 super().__init__(name, **args)
528
529 def __len__(self):
530 return len(self.extraTable)
531
532 def __getitem__(self, index):
533 """Faster than PrefixDecoder
534 """
535 if index>=len(self.extraTable):
536 raise ValueError("No symbol {}[{}]".format(
537 self.__class__.__name__, index))
538 return Symbol(self, index)
539
540 def value(self, index, extra):
541 """Override if you don't define value0 and extraTable
542 """
543 lower, upper = self.span(index)
544 value = lower+(extra or 0)
545 if value>upper:
546 raise ValueError('value: extra out of range')
547 return value
548
549 def span(self, index):
550 """Give the range of possible values in a tuple
551 Useful for mnemonic and explanation
552 """
553 lower = self.value0+sum(1<<x for x in self.extraTable[:index])
554 upper = lower+(1<<self.extraTable[index])
555 return lower, upper-1
556
557#======================Code subclasses======================================
558#Alphabets used in the metablock header----------------------------------
559#For prefix codes
560class PrefixCodeHeader(WithExtra):
561 """Header of prefix codes.
562 """
563 def __init__(self, codename):
564 super().__init__('PFX', bitLength=2)
565 #this is the name of the code that it describes
566 self.codename = codename
567
568 def extraBits(self, index):
569 return 2 if index==1 else 0
570
571 def value(self, index, extra):
572 """Returns ('Simple', #codewords) or ('Complex', HSKIP)
573 """
574 if index==1:
575 if extra>3:
576 raise ValueError('value: extra out of range')
577 return 'Simple', extra+1
578 if extra:
579 raise ValueError('value: extra out of range')
580 return 'Complex', index
581
582 def explanation(self, index, extra):
583 if index==1:
584 return '{} is simple with {} code word{}'.format(
585 self.codename, extra+1, 's' if extra else '')
586 lengths = [1, 2, 3, 4, 0, 5, 17, 6]
587 return '{} is complex with lengths {}...'.format(
588 self.codename,
589 ','.join(
590 map(str, lengths[index:index+5]))
591 )
592
593class TreeShapeAlhabet(BoolCode):
594 """The bit used to indicate if four word code is "deep" or "wide"
595 """
596 name = 'SHAPE'
597 def value(self, index):
598 return [(2,2,2,2), (1,2,3,3)][index]
599
600 def explanation(self, index):
601 return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index))
602
603class LengthOfLengthAlphabet(Code):
604 """For use in decoding complex code descriptors.
605 >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('')
606 >>> print(lengthOfLengthAlphabet[2])
607 coded with 2 bits
608 >>> len(lengthOfLengthAlphabet[0])
609 2
610 >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)]
611 [2, 4, 3, 2, 2, 4]
612 >>> lengthOfLengthAlphabet.showCode()
613 00:skipped 01:coded with 4 bits 0111:coded with 1 bits
614 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits
615 """
616 decodeTable = {
617 0b00:0, 0b10:3,
618 0b0111:1, 0b01:4,
619 0b011:2, 0b1111:5,
620 }
621
622 def __init__(self, name=None, **args):
623 super().__init__(name, decodeTable=self.decodeTable, **args)
624
625 def mnemonic(self, index):
626 if index==0: return 'skipped'
627 return 'coded with {} bits'.format(index)
628
629 def explanation(self, index, extra=None):
630 return self.description+': '+self.mnemonic(index)
631
632class LengthAlphabet(WithExtra):
633 """Length of symbols
634 Used during construction of a code.
635 """
636 def __init__(self, name):
637 super().__init__(name, alphabetSize=18)
638
639 def extraBits(self, index):
640 return {16:2, 17:3}.get(index, 0)
641
642 def mnemonic(self, index):
643 if index==0: return 'unused'
644 elif index==16: return 'rep xx'
645 elif index==17: return 'zero xxx'
646 else: return 'len {}'.format(index)
647
648 def explanation(self, index, extra):
649 return self.description.format(self[index], extra)
650
651 def value(self, index, extra):
652 #the caller got the length already, so extra is enough
653 return extra
654
655#Stream header
656class WindowSizeAlphabet(Code):
657 """The alphabet used for window size in the stream header.
658 >>> WindowSizeAlphabet()[10].explanation()
659 'windowsize=(1<<10)-16=1008'
660 """
661 decodeTable = {
662 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22,
663 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23,
664 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24,
665 0b1010001: 13, 0b0000001: 17, 0b1001: 21,
666 0b0010001: None,
667 }
668
669 name = 'WSIZE'
670
671 def __init__(self, name=None):
672 super().__init__(name, decodeTable=self.decodeTable)
673
674 def value(self, index):
675 #missing value gives index None
676 if index is None: return None
677 return (1<<index)-16
678
679 def explanation(self, index):
680 return 'windowsize=(1<<{})-16={}'.format(
681 index, (1<<index)-16)
682
683#Metablock
684class MetablockLengthAlphabet(WithExtra):
685 """Used for the meta block length;
686 also indicates a block with no data
687 >>> metablockLengthAlphabet = MetablockLengthAlphabet()
688 >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0])
689 Symbol(MLEN, 0)
690 'empty'
691 >>> metablockLengthAlphabet[3]
692 Traceback (most recent call last):
693 ...
694 ValueError: No symbol MetablockLengthAlphabet[3]
695 >>> print(metablockLengthAlphabet[4])
696 hhhh00
697 >>> metablockLengthAlphabet[4].value(0x1000)
698 4097
699 >>> metablockLengthAlphabet[5].value(0x1000)
700 Traceback (most recent call last):
701 ...
702 InvalidStream: Zeros in high nibble of MLEN
703 >>> metablockLengthAlphabet[5].explanation(0x12345)
704 'data length: 12345h+1=74566'
705 >>> metablockLengthAlphabet.showCode()
706 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty
707 """
708 decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6}
709
710 name = 'MLEN'
711 def __init__(self, name=None):
712 super().__init__(name, decodeTable=self.decodeTable)
713
714 def extraBits(self, index):
715 return index*4
716
717 def mnemonic(self, index):
718 if index==0: return 'empty'
719 return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
720
721 def value(self, index, extra):
722 extraBits = self.extraBits(index)
723 if not 0<=extra<1<<extraBits:
724 raise ValueError('value: extra out of range')
725 if index==0: return 0
726 if index>4 and extra>>extraBits-4==0: raise InvalidStream(
727 'Zeros in high nibble of MLEN')
728 return extra+1
729
730 def explanation(self, index, extra):
731 if index==0: return '11: empty block'
732 extraBits = self.extraBits(index)
733 return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1)
734
735
736class ReservedAlphabet(BoolCode):
737 """The reserved bit that must be zero.
738 """
739 name = 'RSVD'
740 def value(self, index):
741 if index: raise ValueError('Reserved bit is not zero')
742
743 def explanation(self, index):
744 return 'Reserved (must be zero)'
745
746class FillerAlphabet(Code):
747 def __init__(self, *, streamPos):
748 super().__init__('SKIP', bitLength=(-streamPos)&7)
749
750 def explanation(self, index):
751 return '{} bit{} ignored'.format(
752 self.length(index),
753 '' if self.length(index)==1 else 's',
754 )
755
756class SkipLengthAlphabet(WithExtra):
757 """Used for the skip length in an empty metablock
758 >>> skipLengthAlphabet = SkipLengthAlphabet()
759 >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0])
760 Symbol(SKIP, 0)
761 'empty'
762 >>> skipLengthAlphabet[4]
763 Traceback (most recent call last):
764 ...
765 ValueError: index out of range
766 >>> print(skipLengthAlphabet[3])
767 hhhhhh11
768 >>> skipLengthAlphabet[2].value(0x1000)
769 4097
770 >>> skipLengthAlphabet[3].value(0x1000)
771 Traceback (most recent call last):
772 ...
773 InvalidStream: Zeros in high byte of SKIPBYTES
774 >>> skipLengthAlphabet[3].explanation(0x12345)
775 'skip length: 12345h+1=74566'
776 >>> skipLengthAlphabet.showCode()
777 00:empty 01:hh01 10:hhhh10 11:hhhhhh11
778 """
779 def __init__(self):
780 super().__init__('SKIP', bitLength=2)
781
782 def extraBits(self, index):
783 return index*8
784
785 def mnemonic(self, index):
786 if index==0: return 'empty'
787 return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
788
789 def value(self, index, extra):
790 extraBits = self.extraBits(index)
791 if not 0<=extra<1<<extraBits:
792 raise ValueError('value: extra out of range')
793 if index==0: return 0
794 if index>1 and extra>>extraBits-8==0:
795 raise InvalidStream('Zeros in high byte of SKIPBYTES')
796 return extra+1
797
798 def explanation(self, index, extra):
799 if index==0: return '00: no skip'
800 extraBits = self.extraBits(index)
801 return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1)
802
803
804class TypeCountAlphabet(Enumerator):
805 """Used for giving block type counts and tree counts.
806 >>> TypeCountAlphabet(description='').showCode()
807 0:0 0101:xx,0101 1011:xxxxx,1011
808 0001:0001 1101:xxxxxx,1101 0111:xxx,0111
809 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111
810 """
811 decodeTable = {
812 0b0: 0, 0b1001: 5,
813 0b0001: 1, 0b1011: 6,
814 0b0011: 2, 0b1101: 7,
815 0b0101: 3, 0b1111: 8,
816 0b0111: 4,
817 }
818
819 value0 = 1
820 extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7]
821 name = 'BT#'
822
823 def __init__(self, name=None, *, description):
824 super().__init__(
825 name,
826 decodeTable=self.decodeTable,
827 description=description)
828
829 def mnemonic(self, index):
830 if index==0: return '0'
831 if index==1: return '0001'
832 return 'x'*(self.extraBits(index))+','+self.bitPattern(index)
833
834 def explanation(self, index, extra):
835 value = self.value(index, extra)
836 description = self.description
837 if value==1: description = description[:-1]
838 return '{}: {} {}'.format(
839 self.mnemonic(index),
840 value,
841 description)
842
843class BlockTypeAlphabet(Code):
844 """The block types; this code works for all three kinds.
845 >>> b = BlockTypeAlphabet('T', NBLTYPES=5)
846 >>> print(*(x for x in b))
847 prev +1 #0 #1 #2 #3 #4
848 """
849 def __init__(self, name, NBLTYPES, **args):
850 super().__init__(name, alphabetSize=NBLTYPES+2, **args)
851 self.NBLTYPES = NBLTYPES
852
853 def mnemonic(self, index):
854 if index==0: return 'prev'
855 elif index==1: return '+1'
856 else: return '#'+str(index-2)
857
858 def value(self, index):
859 return index-2
860
861 def explanation(self, index):
862 if index==0: return '0: previous'
863 elif index==1: return '1: increment'
864 else: return 'Set block type to: '+str(index-2)
865
866class BlockCountAlphabet(Enumerator):
867 """Block counts
868 >>> b = BlockCountAlphabet('L')
869 >>> print(b[25])
870 [24*x]: BC16625-16793840
871 """
872
873 value0 = 1
874 extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24]
875 def __init__(self, name, **args):
876 super().__init__(name, alphabetSize=26, **args)
877
878 def mnemonic(self, index):
879 extraBits = self.extraBits(index)
880 return '{}: BC{}-{}'.format(
881 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits),
882 *self.span(index))
883
884 def explanation(self, index, extra):
885 return 'Block count: '+super().explanation(index, extra)
886
887class DistanceParamAlphabet(WithExtra):
888 """The distance parameters NPOSTFIX and NDIRECT.
889 Although these are treated as two in the description, this is easier.
890 """
891 def __init__(self):
892 super().__init__('DIST', bitLength=2)
893
894 def extraBits(self, index):
895 return 4
896
897 def value(self, index, extra):
898 """Returns NPOSTFIX and NDIRECT<<NPOSTFIX
899 """
900 if extra>15:
901 raise ValueError('value: extra out of range')
902 return index, extra<<index
903
904 def explanation(self, index, extra):
905 return '{} postfix bits and {:04b}<<{}={} direct codes'.format(
906 index, extra, index, extra<<index)
907
908 def mnemonic(self, index):
909 return 'PF'+str(index)
910
911class LiteralContextMode(Code):
912 """For the literal context modes.
913 >>> LiteralContextMode().showCode()
914 00:LSB6 01:MSB6 10:UTF8 11:Signed
915 >>> LiteralContextMode().explanation(2)
916 'Context mode for type 9: 2(UTF8)'
917 """
918
919 def __init__(self, *, number=9):
920 super().__init__('LC'+str(number), bitLength=2)
921 self.number = number
922
923 def mnemonic(self, index):
924 return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index]
925
926 def explanation(self, index):
927 return 'Context mode for type {}: {}({})'.format(
928 self.number,
929 index,
930 self.mnemonic(index))
931
932class RLEmaxAlphabet(Enumerator):
933 """Used for describing the run length encoding used for describing context maps.
934 >>> RLEmaxAlphabet().showCode()
935 0:1 1:more
936 """
937 value0 = 0
938 extraTable = [0, 4]
939 name = 'RLE#'
940
941 def mnemonic(self, index):
942 return ['1', 'more'][index]
943
944 def explanation(self, index, extra):
945 description = self.description and self.description+': '
946 if index==0: return description+'No RLE coding'
947 return '{}xxxx 1: RLEMAX={}'.format(description, extra+1)
948
949class TreeAlphabet(WithExtra):
950 """The alphabet to enumerate entries (called trees) in the context map.
951 parameters are RLEMAX and NTREES
952 >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5)
953 >>> len(t)
954 8
955 >>> print(t[2])
956 xx+4 zeroes
957 >>> t[3].explanation(2)
958 '8+010=10 zeroes'
959 >>> t[0].value(0)
960 (1, 0)
961 """
962 name = 'CMI'
963 def __init__(self, name=None, *, RLEMAX, NTREES, **args):
964 super().__init__(name, alphabetSize=RLEMAX+NTREES, **args)
965 self.RLEMAX = RLEMAX
966 self.NTREES = NTREES
967
968 def extraBits(self, index):
969 if 0<index<=self.RLEMAX: return index
970 return 0
971
972 def mnemonic(self, index):
973 if index==0: return 'map #0'
974 if index<=self.RLEMAX:
975 return '{}+{} zeroes'.format('x'*index, 1<<index)
976 return 'map #{}'.format(index-self.RLEMAX)
977
978 def value(self, index, extra):
979 """Give count and value."""
980 index = index
981 if index==0: return 1, 0
982 if index<=self.RLEMAX: return (1<<index)+extra, 0
983 return 1, index-self.RLEMAX
984
985 def explanation(self, index, extra):
986 description = self.description and self.description+': '
987 if index==0: return description+'map #0'
988 if index<=self.RLEMAX:
989 return '{}+{:0{}b}={} zeroes'.format(
990 (1<<index),
991 extra, self.extraBits(index),
992 (1<<index)+extra)
993 return '{}map #{}-{}={}'.format(
994 description,
995 index, self.RLEMAX, index-self.RLEMAX)
996
997#Prefix alphabets for the data stream----------------------------------
998class LiteralAlphabet(Code):
999 """Alphabet of symbols.
1000 """
1001 minLength = maxLength = 8
1002 def __init__(self, number):
1003 super().__init__('L'+str(number), alphabetSize=1<<8)
1004
1005 def mnemonic(self, index):
1006 return outputCharFormatter(index)
1007
1008 def value(self, index, extra=None):
1009 return index
1010
1011 def explanation(self, index, extra=None):
1012 return self.mnemonic(index)
1013
1014class InsertLengthAlphabet(Enumerator):
1015 """Intern code for insert counts
1016 """
1017 value0 = 0
1018 extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24]
1019
1020class CopyLengthAlphabet(Enumerator):
1021 value0 = 2
1022 extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24]
1023
1024class InsertAndCopyAlphabet(WithExtra):
1025 """The insert and copy code
1026 >>> for x in range(0,704,704//13):
1027 ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x])
1028 0 I0C2&D=0
1029 110110 I6+xC8&D=0
1030 1101100 I5C22+xxx&D=0
1031 10100010 I4C4
1032 11011000 I3C10+x
1033 100001110 I14+xxC8
1034 101000100 I10+xxC22+xxx
1035 101111010 I98+xxxxxC14+xx
1036 110110000 I6+xC70+xxxxx
1037 111100110 I1090+[10*x]C8
1038 1000011100 I26+xxxC326+[8*x]
1039 1001010010 I322+[8*x]C14+xx
1040 1010001000 I194+[7*x]C70+xxxxx
1041 1010111110 I22594+[24*x]C1094+[10*x]
1042 """
1043 insertLengthAlphabet = InsertLengthAlphabet(None)
1044 copyLengthAlphabet = CopyLengthAlphabet(None)
1045
1046 def __init__(self, number=''):
1047 super().__init__('IC'+str(number), bitLength=10)
1048
1049 def __len__(self):
1050 return 704
1051
1052 def extraBits(self, index):
1053 insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
1054 return InsertLengthAlphabet.extraTable[insertSymbol.index] + \
1055 CopyLengthAlphabet.extraTable[copySymbol.index]
1056
1057 def splitSymbol(self, index):
1058 """Give relevant values for computations:
1059 (insertSymbol, copySymbol, dist0flag)
1060 """
1061 #determine insert and copy upper bits from table
1062 row = [0,0,1,1,2,2,1,3,2,3,3][index>>6]
1063 col = [0,1,0,1,0,1,2,0,2,1,2][index>>6]
1064 #determine inserts and copy sub codes
1065 insertLengthCode = row<<3 | index>>3&7
1066 if row: insertLengthCode -= 8
1067 copyLengthCode = col<<3 | index&7
1068 return (
1069 Symbol(self.insertLengthAlphabet, insertLengthCode),
1070 Symbol(self.copyLengthAlphabet, copyLengthCode),
1071 row==0
1072 )
1073
1074 def mnemonic(self, index):
1075 """Make a nice mnemonic
1076 """
1077 i,c,d0 = self.splitSymbol(index)
1078 iLower, _ = i.code.span(i.index)
1079 iExtra = i.extraBits()
1080 cLower, _ = c.code.span(c.index)
1081 cExtra = c.extraBits()
1082 return 'I{}{}{}C{}{}{}{}'.format(
1083 iLower,
1084 '+' if iExtra else '',
1085 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra),
1086 cLower,
1087 '+' if cExtra else '',
1088 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra),
1089 '&D=0' if d0 else '')
1090
1091 def value(self, index, extra):
1092 i,c,d0 = self.splitSymbol(index)
1093 iExtra = i.extraBits()
1094 ce, ie = extra>>iExtra, extra&(1<<iExtra)-1
1095 insert = i.value(ie)
1096 copy = c.value(ce)
1097 return insert, copy, d0
1098
1099 def explanation(self, index, extra):
1100 insert, copy, d0 = self.value(index, extra)
1101 if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy)
1102 else: return 'Literal: {}, copy: {}'.format(insert, copy)
1103
1104class DistanceAlphabet(WithExtra):
1105 """Represent the distance encoding.
1106 Dynamically generated alphabet.
1107 This is what the documentation should have said:
1108 Ignoring offsets for the moment, the "long" encoding works as follows:
1109 Write the distance in binary as follows:
1110 1xy..yz..z, then the distance symbol consists of n..nxz..z
1111 Where:
1112 n is one less than number of bits in y
1113 x is a single bit
1114 y..y are n+1 extra bits (encoded in the bit stream)
1115 z..z is NPOSTFIX bits that are part of the symbol
1116 The offsets are so as to start at the lowest useable value:
1117 if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1
1118 then n..nxz..z is symbol -NDIRECT-16
1119 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1120 >>> print(d[4], d[17], d[34])
1121 last-1 1 10xx00-5
1122 >>> [str(d[x]) for x in range(26, 32)]
1123 ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5']
1124 """
1125 def __init__(self, number, *, NPOSTFIX, NDIRECT):
1126 self.NPOSTFIX = NPOSTFIX
1127 self.NDIRECT = NDIRECT
1128 #set length
1129 #Actually, not all symbols are used,
1130 #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX)
1131 super().__init__('D'+str(number),
1132 alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX))
1133
1134 def extraBits(self, index):
1135 """Indicate how many extra bits are needed to interpret symbol
1136 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1137 >>> [d[i].extraBits() for i in range(26)]
1138 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1139 >>> [d[i].extraBits() for i in range(26,36)]
1140 [1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
1141 """
1142 if index<16+self.NDIRECT: return 0
1143 return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
1144
1145 def value(self, dcode, dextra):
1146 """Decode value of symbol together with the extra bits.
1147 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1148 >>> d[34].value(2)
1149 (0, 35)
1150 """
1151 if dcode<16:
1152 return [(1,0),(2,0),(3,0),(4,0),
1153 (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3),
1154 (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3)
1155 ][dcode]
1156 if dcode<16+self.NDIRECT:
1157 return (0,dcode-16)
1158 #we use the original formulas, instead of my clear explanation
1159 POSTFIX_MASK = (1 << self.NPOSTFIX) - 1
1160 ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
1161 hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX
1162 lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK
1163 offset = ((2 + (hcode & 1)) << ndistbits) - 4
1164 distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1
1165 return (0,distance)
1166
1167 def mnemonic(self, index, verbose=False):
1168 """Give mnemonic representation of meaning.
1169 verbose compresses strings of x's
1170 """
1171 if index<16:
1172 return ['last', '2last', '3last', '4last',
1173 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3',
1174 '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3'
1175 ][index]
1176 if index<16+self.NDIRECT:
1177 return str(index-16)
1178 #construct strings like "1xx01-15"
1179 index -= self.NDIRECT+16
1180 hcode = index >> self.NPOSTFIX
1181 lcode = index & (1<<self.NPOSTFIX)-1
1182 if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}'
1183 else: formatString = '1{0}{1}{4:+d}'
1184 return formatString.format(
1185 hcode&1,
1186 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1),
1187 lcode, self.NPOSTFIX,
1188 self.NDIRECT+1-(4<<self.NPOSTFIX))
1189
1190 def explanation(self, index, extra):
1191 """
1192 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1193 >>> d[55].explanation(13)
1194 '11[1101]01-5: [0]+240'
1195 """
1196 extraBits = self.extraBits(index)
1197 extraString = '[{:0{}b}]'.format(extra, extraBits)
1198 return '{0}: [{1[0]}]{1[1]:+d}'.format(
1199 self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString),
1200 self.value(index, extra))
1201
1202#Classes for doing actual work------------------------------------------
1203class ContextModeKeeper:
1204 """For computing the literal context mode.
1205 You feed it characters, and it computes indices in the context map.
1206 """
1207 def __init__(self, mode):
1208 self.chars = deque([0,0], maxlen=2)
1209 self.mode = mode
1210
1211 def setContextMode(self, mode):
1212 """Switch to given context mode (0..3)"""
1213 self.mode = mode
1214 def getIndex(self):
1215 if self.mode==0: #LSB6
1216 return self.chars[1]&0x3f
1217 elif self.mode==1: #MSB6
1218 return self.chars[1]>>2
1219 elif self.mode==2: #UTF8: character class of previous and a bit of the second
1220 p2,p1 = self.chars
1221 return self.lut0[p1]|self.lut1[p2]
1222 elif self.mode==3: #Signed: initial bits of last two bytes
1223 p2,p1 = self.chars
1224 return self.lut2[p1]<<3|self.lut2[p2]
1225
1226 def add(self, index):
1227 """Adjust the context for output char (as int)."""
1228 self.chars.append(index)
1229
1230 #0: control #16: quote #32: ,:; #48: AEIOU
1231 #4: tab/lf/cr #20: % #36: . #52: BC..Z
1232 #8: space #24: (<[{ #40: = #56: aeiou
1233 #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z
1234 lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0,
1235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1236 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
1237 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
1238 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
1239 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
1240 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
1241 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0
1242 ]+[0,1]*32+[2,3]*32
1243 #0: space 1:punctuation 2:digit/upper 3:lower
1244 lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1246 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1247 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1248 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1249 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
1250 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1251 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0
1252 ]+[0]*96+[2]*32
1253 #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1
1254 lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7]
1255 assert len(lut0)==len(lut1)==len(lut2)==256
1256
1257class WordList:
1258 """Word list.
1259 >>> WordList().word(7, 35555)
1260 b'Program to '
1261 """
1262 NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10,
1263 10, 10, 10, 9, 9, 8, 7, 7, 8, 7,
1264 7, 6, 6, 5, 5]
1265 def __init__(self):
1266 self.file = open('dict', 'rb')
1267 self.compileActions()
1268
1269 def word(self, size, dist):
1270 """Get word
1271 """
1272 #split dist in index and action
1273 ndbits = self.NDBITS[size]
1274 index = dist&(1<<ndbits)-1
1275 action = dist>>ndbits
1276 #compute position in file
1277 position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index
1278 self.file.seek(position)
1279 return self.doAction(self.file.read(size), action)
1280
1281 def upperCase1(self, word):
1282 word = word.decode('utf8')
1283 word = word[0].upper()+word[1:]
1284 return word.encode('utf8')
1285
1286
1287 #Super compact form of action table.
1288 #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst
1289 actionTable = r"""
1290 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_
1291 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+.
1292 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w
1293 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+,
1294 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+="
1295 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+="
1296 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_
1297 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_
1298 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\'
1299 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+,
1300 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+="
1301 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_
1302 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+,
1303 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+(
1304 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._
1305 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+.
1306 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\'
1307 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._
1308 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+="
1309 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\'
1310 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\'
1311 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+.
1312 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+">
1313 23:w[:-3] 48:w[:-7] 98:_+w+=\'
1314 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+,
1315 """
1316
1317 def compileActions(self):
1318 """Build the action table from the text above
1319 """
1320 import re
1321 self.actionList = actions = [None]*121
1322 #Action 73, which is too long, looks like this when expanded:
1323 actions[73] = "b' the '+w+b' of the '"
1324 #find out what the columns are
1325 actionLines = self.actionTable.splitlines()
1326 colonPositions = [m.start()
1327 for m in re.finditer(':',actionLines[1])
1328 ]+[100]
1329 columns = [(colonPositions[i]-3,colonPositions[i+1]-3)
1330 for i in range(len(colonPositions)-1)]
1331 for line in self.actionTable.splitlines(keepends=False):
1332 for start,end in columns:
1333 action = line[start:end]
1334 #skip empty actions
1335 if not action or action.isspace(): continue
1336 #chop it up, and check if the colon is properly placed
1337 index, colon, action = action[:3], action[3], action[4:]
1338 assert colon==':'
1339 #remove filler spaces at right
1340 action = action.rstrip()
1341 #replace space symbols
1342 action = action.replace('_', ' ')
1343 wPos = action.index('w')
1344 #add quotes around left string when present
1345 #translation: any pattern from beginning, up to
1346 #(but not including) a + following by a w later on
1347 action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action)
1348 #add quotes around right string when present
1349 #translation: anything with a w in it, followed by a +
1350 #and a pattern up to the end
1351 #(there is no variable lookbehind assertion,
1352 #so we have to copy the pattern)
1353 action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action)
1354 #expand shortcut for uppercaseAll
1355 action = action.replace(".U", ".upper()")
1356 #store action
1357 actions[int(index)] = action
1358
1359 def doAction(self, w, action):
1360 """Perform the proper action
1361 """
1362 #set environment for the UpperCaseFirst
1363 U = self.upperCase1
1364 return eval(self.actionList[action], locals())
1365
1366class Layout:
1367 """Class to layout the output.
1368 """
1369 #display width of hexdata+bitdata
1370 width = 25
1371 #general
1372 def __init__(self, stream):
1373 self.stream = stream
1374 self.bitPtr = self.width
1375
1376 def makeHexData(self, pos):
1377 """Produce hex dump of all data containing the bits
1378 from pos to stream.pos
1379 """
1380 firstAddress = pos+7>>3
1381 lastAddress = self.stream.pos+7>>3
1382 return ''.join(map('{:02x} '.format,
1383 self.stream.data[firstAddress:lastAddress]))
1384
1385 def formatBitData(self, pos, width1, width2=0):
1386 """Show formatted bit data:
1387 Bytes are separated by commas
1388 whole bytes are displayed in hex
1389 >>> Layout(olleke).formatBitData(6, 2, 16)
1390 '|00h|2Eh,|00'
1391 >>> Layout(olleke).formatBitData(4, 1, 0)
1392 '1'
1393 """
1394 result = []
1395 #make empty prefix code explicit
1396 if width1==0: result = ['()', ',']
1397 for width in width1, width2:
1398 #skip empty width2
1399 if width==0: continue
1400 #build result backwards in a list
1401 while width>0:
1402 availableBits = 8-(pos&7)
1403 if width<availableBits:
1404 #read partial byte, beginning nor ending at boundary
1405 data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1
1406 result.append('{:0{}b}'.format(data, width))
1407 elif availableBits<8:
1408 #read rest of byte, ending at boundary
1409 data = self.stream.data[pos>>3] >> (pos&7)
1410 result.append('|{:0{}b}'.format(data, availableBits))
1411 else:
1412 #read whole byte (in hex), beginning and ending at boundary
1413 data = self.stream.data[pos>>3]
1414 result.append('|{:02X}h'.format(data))
1415 width -= availableBits
1416 pos += availableBits
1417 #if width overshot from the availableBits subtraction, fix it
1418 pos += width
1419 #add comma to separate fields
1420 result.append(',')
1421 #concatenate pieces, reversed, skipping the last space
1422 return ''.join(result[-2::-1])
1423
1424 def readPrefixCode(self, alphabet):
1425 """give alphabet the prefix code that is read from the stream
1426 Called for the following alphabets, in this order:
1427 The alphabet in question must have a "logical" order,
1428 otherwise the assignment of symbols doesn't work.
1429 """
1430 mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name))
1431 if mode=='Complex':
1432 #for a complex code, numberOfSymbols means hskip
1433 self.readComplexCode(numberOfSymbols, alphabet)
1434 return alphabet
1435 else:
1436 table = []
1437 #Set table of lengths for mnemonic function
1438 lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1]
1439 #adjust mnemonic function of alphabet class
1440 def myMnemonic(index):
1441 return '{} bit{}: {}'.format(
1442 lengths[i],
1443 '' if lengths[i]==1 else 's',
1444 alphabet.__class__.mnemonic(alphabet, index)
1445 )
1446 alphabet.mnemonic = myMnemonic
1447 for i in range(numberOfSymbols):
1448 table.append(self.verboseRead(alphabet, skipExtra=True).index)
1449 #restore mnemonic
1450 del alphabet.mnemonic
1451 if numberOfSymbols==4:
1452 #read tree shape to redefine lengths
1453 lengths = self.verboseRead(TreeShapeAlhabet())
1454 #construct the alphabet prefix code
1455 alphabet.setLength(dict(zip(table, lengths)))
1456 return alphabet
1457
1458 def readComplexCode(self, hskip, alphabet):
1459 """Read complex code"""
1460 stream = self.stream
1461 #read the lengths for the length code
1462 lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:]
1463 codeLengths = {}
1464 total = 0
1465 lol = LengthOfLengthAlphabet('##'+alphabet.name)
1466 #lengthCode will be used for coding the lengths of the new code
1467 #we use it for display until now; definition comes below
1468 lengthCode = LengthAlphabet('#'+alphabet.name)
1469 lengthIter = iter(lengths)
1470 while total<32:
1471 newSymbol = next(lengthIter)
1472 lol.description = str(lengthCode[newSymbol])
1473 length = self.verboseRead(lol)
1474 if length:
1475 codeLengths[newSymbol] = length
1476 total += 32>>length
1477 if total>=32:
1478 break
1479 if total>32: raise ValueError("Stream format")
1480 #Now set the encoding of the lengthCode
1481 lengthCode.setLength(codeLengths)
1482 print("***** Lengths for {} will be coded as:".format(alphabet.name))
1483 lengthCode.showCode()
1484 #Now determine the symbol lengths with the lengthCode
1485 symbolLengths = {}
1486 total = 0
1487 lastLength = 8
1488 alphabetIter = iter(alphabet)
1489 while total<32768:
1490 #look ahead to see what is going to happen
1491 length = lengthCode.decodePeek(
1492 self.stream.peek(lengthCode.maxLength))[1].index
1493 #in every branch, set lengthCode.description to explanatory text
1494 #lengthCode calls format(symbol, extra) with this string
1495 if length==0:
1496 symbol = next(alphabetIter)
1497 lengthCode.description = 'symbol {} unused'.format(symbol)
1498 self.verboseRead(lengthCode)
1499 #unused symbol
1500 continue
1501 if length==16:
1502 lengthCode.description = \
1503 '{1}+3 symbols of length '+str(lastLength)
1504 extra = self.verboseRead(lengthCode)
1505 #scan series of 16s (repeat counts)
1506 #start with repeat count 2
1507 repeat = 2
1508 startSymbol = next(alphabetIter)
1509 endSymbol = next(alphabetIter)
1510 symbolLengths[startSymbol.index] = \
1511 symbolLengths[endSymbol.index] = lastLength
1512 #count the two just defined symbols
1513 total += 2*32768>>lastLength
1514 #note: loop may end because we're there
1515 #even if a 16 _appears_ to follow
1516 while True:
1517 #determine last symbol
1518 oldRepeat = repeat
1519 repeat = (repeat-2<<2)+extra+3
1520 #read as many symbols as repeat increased
1521 for i in range(oldRepeat, repeat):
1522 endSymbol = next(alphabetIter)
1523 symbolLengths[endSymbol.index] = lastLength
1524 #compute new total; it may be end of loop
1525 total += (repeat-oldRepeat)*32768>>lastLength
1526 if total>=32768: break
1527 #see if there is more to do
1528 length = lengthCode.decodePeek(
1529 self.stream.peek(lengthCode.maxLength))[1].index
1530 if length!=16: break
1531 lengthCode.description = 'total {}+{{1}} symbols'.format(
1532 (repeat-2<<2)+3)
1533 extra = self.verboseRead(lengthCode)
1534 elif length==17:
1535 #read, and show explanation
1536 lengthCode.description = '{1}+3 unused'
1537 extra = self.verboseRead(lengthCode)
1538 #scan series of 17s (groups of zero counts)
1539 #start with repeat count 2
1540 repeat = 2
1541 startSymbol = next(alphabetIter)
1542 endSymbol = next(alphabetIter)
1543 #note: loop will not end with total==32768,
1544 #since total doesn't change here
1545 while True:
1546 #determine last symbol
1547 oldRepeat = repeat
1548 repeat = (repeat-2<<3)+extra+3
1549 #read as many symbols as repeat increases
1550 for i in range(repeat-oldRepeat):
1551 endSymbol = next(alphabetIter)
1552 #see if there is more to do
1553 length = lengthCode.decodePeek(
1554 self.stream.peek(lengthCode.maxLength))[1].index
1555 if length!=17: break
1556 lengthCode.description = 'total {}+{{1}} unused'.format(
1557 (repeat-2<<3)+3)
1558 extra = self.verboseRead(lengthCode)
1559 else:
1560 symbol = next(alphabetIter)
1561 #double braces for format
1562 char = str(symbol)
1563 if char in '{}': char *= 2
1564 lengthCode.description = \
1565 'Length for {} is {{0.index}} bits'.format(char)
1566 #output is not needed (will be 0)
1567 self.verboseRead(lengthCode)
1568 symbolLengths[symbol.index] = length
1569 total += 32768>>length
1570 lastLength = length
1571 assert total==32768
1572 alphabet.setLength(symbolLengths)
1573 print('End of table. Prefix code '+alphabet.name+':')
1574 alphabet.showCode()
1575
1576 #stream
1577 def processStream(self):
1578 """Process a brotli stream.
1579 """
1580 print('addr hex{:{}s}binary context explanation'.format(
1581 '', self.width-10))
1582 print('Stream header'.center(60, '-'))
1583 self.windowSize = self.verboseRead(WindowSizeAlphabet())
1584 print('Metablock header'.center(60, '='))
1585 self.ISLAST = False
1586 self.output = bytearray()
1587 while not self.ISLAST:
1588 self.ISLAST = self.verboseRead(
1589 BoolCode('LAST', description="Last block"))
1590 if self.ISLAST:
1591 if self.verboseRead(
1592 BoolCode('EMPTY', description="Empty block")): break
1593 if self.metablockLength(): continue
1594 if not self.ISLAST and self.uncompressed(): continue
1595 print('Block type descriptors'.center(60, '-'))
1596 self.numberOfBlockTypes = {}
1597 self.currentBlockCounts = {}
1598 self.blockTypeCodes = {}
1599 self.blockCountCodes = {}
1600 for blockType in (L,I,D): self.blockType(blockType)
1601 print('Distance code parameters'.center(60, '-'))
1602 self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet())
1603 self.readLiteralContextModes()
1604 print('Context maps'.center(60, '-'))
1605 self.cmaps = {}
1606 #keep the number of each kind of prefix tree for the last loop
1607 numberOfTrees = {I: self.numberOfBlockTypes[I]}
1608 for blockType in (L,D):
1609 numberOfTrees[blockType] = self.contextMap(blockType)
1610 print('Prefix code lists'.center(60, '-'))
1611 self.prefixCodes = {}
1612 for blockType in (L,I,D):
1613 self.readPrefixArray(blockType, numberOfTrees[blockType])
1614 self.metablock()
1615
1616 #metablock header
1617 def verboseRead(self, alphabet, context='', skipExtra=False):
1618 """Read symbol and extra from stream and explain what happens.
1619 Returns the value of the symbol
1620 >>> olleke.pos = 0
1621 >>> l = Layout(olleke)
1622 >>> l.verboseRead(WindowSizeAlphabet())
1623 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
1624 4194288
1625 """
1626 #TODO 2: verbosity level, e.g. show only codes and maps in header
1627 stream = self.stream
1628 pos = stream.pos
1629 if skipExtra:
1630 length, symbol = alphabet.readTuple(stream)
1631 extraBits, extra = 0, None
1632 else:
1633 length, symbol, extraBits, extra = alphabet.readTupleAndExtra(
1634 stream)
1635 #fields: address, hex data, binary data, name of alphabet, explanation
1636 hexdata = self.makeHexData(pos)
1637 addressField = '{:04x}'.format(pos+7>>3) if hexdata else ''
1638 bitdata = self.formatBitData(pos, length, extraBits)
1639 #bitPtr moves bitdata so that the bytes are easier to read
1640 #jump back to right if a new byte starts
1641 if '|' in bitdata[1:]:
1642 #start over on the right side
1643 self.bitPtr = self.width
1644 fillWidth = self.bitPtr-(len(hexdata)+len(bitdata))
1645 if fillWidth<0: fillWidth = 0
1646 print('{:<5s} {:<{}s} {:7s} {}'.format(
1647 addressField,
1648 hexdata+' '*fillWidth+bitdata, self.width,
1649 context+alphabet.name,
1650 symbol if skipExtra else symbol.explanation(extra),
1651 ))
1652 #jump to the right if we started with a '|'
1653 #because we didn't jump before printing
1654 if bitdata.startswith('|'): self.bitPtr = self.width
1655 else: self.bitPtr -= len(bitdata)
1656 return symbol if skipExtra else symbol.value(extra)
1657
1658 def metablockLength(self):
1659 """Read MNIBBLES and meta block length;
1660 if empty block, skip block and return true.
1661 """
1662 self.MLEN = self.verboseRead(MetablockLengthAlphabet())
1663 if self.MLEN:
1664 return False
1665 #empty block; skip and return False
1666 self.verboseRead(ReservedAlphabet())
1667 MSKIP = self.verboseRead(SkipLengthAlphabet())
1668 self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
1669 self.stream.pos += 8*MSKIP
1670 print("Skipping to {:x}".format(self.stream.pos>>3))
1671 return True
1672
1673 def uncompressed(self):
1674 """If true, handle uncompressed data
1675 """
1676 ISUNCOMPRESSED = self.verboseRead(
1677 BoolCode('UNCMPR', description='Is uncompressed?'))
1678 if ISUNCOMPRESSED:
1679 self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
1680 print('Uncompressed data:')
1681 self.output += self.stream.readBytes(self.MLEN)
1682 print(outputFormatter(self.output[-self.MLEN:]))
1683 return ISUNCOMPRESSED
1684
1685 def blockType(self, kind):
1686 """Read block type switch descriptor for given kind of blockType."""
1687 NBLTYPES = self.verboseRead(TypeCountAlphabet(
1688 'BT#'+kind[0].upper(),
1689 description='{} block types'.format(kind),
1690 ))
1691 self.numberOfBlockTypes[kind] = NBLTYPES
1692 if NBLTYPES>=2:
1693 self.blockTypeCodes[kind] = self.readPrefixCode(
1694 BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES))
1695 self.blockCountCodes[kind] = self.readPrefixCode(
1696 BlockCountAlphabet('BC'+kind[0].upper()))
1697 blockCount = self.verboseRead(self.blockCountCodes[kind])
1698 else:
1699 blockCount = 1<<24
1700 self.currentBlockCounts[kind] = blockCount
1701
1702 def readLiteralContextModes(self):
1703 """Read literal context modes.
1704 LSB6: lower 6 bits of last char
1705 MSB6: upper 6 bits of last char
1706 UTF8: rougly dependent on categories:
1707 upper 4 bits depend on category of last char:
1708 control/whitespace/space/ punctuation/quote/%/open/close/
1709 comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant
1710 lower 2 bits depend on category of 2nd last char:
1711 space/punctuation/digit or upper/lowercase
1712 signed: hamming weight of last 2 chars
1713 """
1714 print('Context modes'.center(60, '-'))
1715 self.literalContextModes = []
1716 for i in range(self.numberOfBlockTypes[L]):
1717 self.literalContextModes.append(
1718 self.verboseRead(LiteralContextMode(number=i)))
1719
1720 def contextMap(self, kind):
1721 """Read context maps
1722 Returns the number of differnt values on the context map
1723 (In other words, the number of prefix trees)
1724 """
1725 NTREES = self.verboseRead(TypeCountAlphabet(
1726 kind[0].upper()+'T#',
1727 description='{} prefix trees'.format(kind)))
1728 mapSize = {L:64, D:4}[kind]
1729 if NTREES<2:
1730 self.cmaps[kind] = [0]*mapSize
1731 else:
1732 #read CMAPkind
1733 RLEMAX = self.verboseRead(RLEmaxAlphabet(
1734 'RLE#'+kind[0].upper(),
1735 description=kind+' context map'))
1736 alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX)
1737 cmapCode = self.readPrefixCode(alphabet)
1738 tableSize = mapSize*self.numberOfBlockTypes[kind]
1739 cmap = []
1740 while len(cmap)<tableSize:
1741 cmapCode.description = 'map {}, entry {}'.format(
1742 *divmod(len(cmap), mapSize))
1743 count, value = self.verboseRead(cmapCode)
1744 cmap.extend([value]*count)
1745 assert len(cmap)==tableSize
1746 IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF'))
1747 if IMTF:
1748 self.IMTF(cmap)
1749 if kind==L:
1750 print('Context maps for literal data:')
1751 for i in range(0, len(cmap), 64):
1752 print(*(
1753 ''.join(map(str, cmap[j:j+8]))
1754 for j in range(i, i+64, 8)
1755 ))
1756 else:
1757 print('Context map for distances:')
1758 print(*(
1759 ''.join(map('{:x}'.format, cmap[i:i+4]))
1760 for i in range(0, len(cmap), 4)
1761 ))
1762 self.cmaps[kind] = cmap
1763 return NTREES
1764
1765 @staticmethod
1766 def IMTF(v):
1767 """In place inverse move to front transform.
1768 """
1769 #mtf is initialized virtually with range(infinity)
1770 mtf = []
1771 for i, vi in enumerate(v):
1772 #get old value from mtf. If never seen, take virtual value
1773 try: value = mtf.pop(vi)
1774 except IndexError: value = vi
1775 #put value at front
1776 mtf.insert(0, value)
1777 #replace transformed value
1778 v[i] = value
1779
1780 def readPrefixArray(self, kind, numberOfTrees):
1781 """Read prefix code array"""
1782 prefixes = []
1783 for i in range(numberOfTrees):
1784 if kind==L: alphabet = LiteralAlphabet(i)
1785 elif kind==I: alphabet = InsertAndCopyAlphabet(i)
1786 elif kind==D: alphabet = DistanceAlphabet(
1787 i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT)
1788 self.readPrefixCode(alphabet)
1789 prefixes.append(alphabet)
1790 self.prefixCodes[kind] = prefixes
1791
1792 #metablock data
1793 def metablock(self):
1794 """Process the data.
1795 Relevant variables of self:
1796 numberOfBlockTypes[kind]: number of block types
1797 currentBlockTypes[kind]: current block types (=0)
1798 literalContextModes: the context modes for the literal block types
1799 currentBlockCounts[kind]: counters for block types
1800 blockTypeCodes[kind]: code for block type
1801 blockCountCodes[kind]: code for block count
1802 cmaps[kind]: the context maps (not for I)
1803 prefixCodes[kind][#]: the prefix codes
1804 lastDistances: the last four distances
1805 lastChars: the last two chars
1806 output: the result
1807 """
1808 print('Meta block contents'.center(60, '='))
1809 self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1}
1810 self.lastDistances = deque([17,16,11,4], maxlen=4)
1811 #the current context mode is for block type 0
1812 self.contextMode = ContextModeKeeper(self.literalContextModes[0])
1813 wordList = WordList()
1814
1815 #setup distance callback function
1816 def distanceCallback(symbol, extra):
1817 "callback function for displaying decoded distance"
1818 index, offset = symbol.value(extra)
1819 if index:
1820 #recent distance
1821 distance = self.lastDistances[-index]+offset
1822 return 'Distance: {}last{:+d}={}'.format(index, offset, distance)
1823 #absolute value
1824 if offset<=maxDistance:
1825 return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset)
1826 #word list value
1827 action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen])
1828 return '{}-{} gives word {},{} action {}'.format(
1829 offset, maxDistance, copyLen, word, action)
1830 for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback
1831
1832 blockLen = 0
1833 #there we go
1834 while blockLen<self.MLEN:
1835 #get insert&copy command
1836 litLen, copyLen, dist0Flag = self.verboseRead(
1837 self.prefixCodes[I][
1838 self.figureBlockType(I)])
1839 #literal data
1840 for i in range(litLen):
1841 bt = self.figureBlockType(L)
1842 cm = self.contextMode.getIndex()
1843 ct = self.cmaps[L][bt<<6|cm]
1844 char = self.verboseRead(
1845 self.prefixCodes[L][ct],
1846 context='{},{}='.format(bt,cm))
1847 self.contextMode.add(char)
1848 self.output.append(char)
1849 blockLen += litLen
1850 #check if we're done
1851 if blockLen>=self.MLEN: return
1852 #distance
1853 #distances are computed relative to output length, at most window size
1854 maxDistance = min(len(self.output), self.windowSize)
1855 if dist0Flag:
1856 distance = self.lastDistances[-1]
1857 else:
1858 bt = self.figureBlockType(D)
1859 cm = {2:0, 3:1, 4:2}.get(copyLen, 3)
1860 ct = self.cmaps[D][bt<<2|cm]
1861 index, offset = self.verboseRead(
1862 self.prefixCodes[D][ct],
1863 context='{},{}='.format(bt,cm))
1864 distance = self.lastDistances[-index]+offset if index else offset
1865 if index==1 and offset==0:
1866 #to make sure distance is not put in last distance list
1867 dist0Flag = True
1868 if distance<=maxDistance:
1869 #copy from output
1870 for i in range(
1871 maxDistance-distance,
1872 maxDistance-distance+copyLen):
1873 self.output.append(self.output[i])
1874 if not dist0Flag: self.lastDistances.append(distance)
1875 comment = 'Seen before'
1876 else:
1877 #fetch from wordlist
1878 newWord = wordList.word(copyLen, distance-maxDistance-1)
1879 self.output.extend(newWord)
1880 #adjust copyLen to reflect actual new data
1881 copyLen = len(newWord)
1882 comment = 'From wordlist'
1883 blockLen += copyLen
1884 print(' '*40,
1885 comment,
1886 ': "',
1887 outputFormatter(self.output[-copyLen:]),
1888 '"',
1889 sep='')
1890 self.contextMode.add(self.output[-2])
1891 self.contextMode.add(self.output[-1])
1892
1893 def figureBlockType(self, kind):
1894 counts, types = self.currentBlockCounts, self.currentBlockTypes
1895 if counts[kind]==0:
1896 newType = self.verboseRead(self.blockTypeCodes[kind])
1897 if newType==-2: newType = types['P'+kind]
1898 elif newType==-1:
1899 newType = (types[kind]+1)%self.numberOfBlockTypes[kind]
1900 types['P'+kind] = types[kind]
1901 types[kind] = newType
1902 counts[kind] = self.verboseRead(self.blockCountCodes[kind])
1903 counts[kind] -=1
1904 return types[kind]
1905
1906__test__ = {
1907'BitStream': """
1908 >>> bs = BitStream(b'Jurjen')
1909 >>> bs.readBytes(2)
1910 b'Ju'
1911 >>> bs.read(6) #r=01110010
1912 50
1913 >>> bs
1914 BitStream(pos=2:6)
1915 >>> bs.peek(5) #j=01101010
1916 9
1917 >>> bs.readBytes(2)
1918 Traceback (most recent call last):
1919 ...
1920 ValueError: readBytes: need byte boundary
1921 """,
1922
1923'Symbol': """
1924 >>> a=Symbol(MetablockLengthAlphabet(),5)
1925 >>> len(a)
1926 2
1927 >>> int(a)
1928 5
1929 >>> a.bitPattern()
1930 '01'
1931 >>> a.value(200000)
1932 200001
1933 >>> a.explanation(300000)
1934 'data length: 493e0h+1=300001'
1935 """,
1936
1937'RangeDecoder': """
1938 >>> a=RangeDecoder(bitLength=3)
1939 >>> len(a)
1940 8
1941 >>> a.name='t'
1942 >>> list(a)
1943 [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)]
1944 >>> a[2]
1945 Symbol(t, 2)
1946 >>> a.bitPattern(4)
1947 '100'
1948 >>> a.length(2)
1949 3
1950 >>> a.decodePeek(15)
1951 (3, Symbol(t, 7))
1952 >>>
1953
1954 """,
1955
1956'PrefixDecoder': """
1957 >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4})
1958 >>> len(a)
1959 4
1960 >>> a.name='t'
1961 >>> list(a)
1962 [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)]
1963 >>> a.decodePeek(22)
1964 (1, Symbol(t, 1))
1965 >>> a.decodePeek(27)
1966 (3, Symbol(t, 3))
1967 >>> a.length(1)
1968 1
1969 >>> a.length(4)
1970 3
1971 """,
1972
1973'Code': """
1974 >>> a=Code('t',alphabetSize=10)
1975 >>> len(a)
1976 10
1977 >>> a.showCode()
1978 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9
1979 >>> a.setLength({2:1,3:2,5:3,6:3})
1980 >>> a.showCode()
1981 0:2 01:3 011:5 111:6
1982 >>> len(a)
1983 4
1984 >>> def callback(i): return 'call{}back'.format(i)
1985 >>> a=Code('t',callback=callback,bitLength=3)
1986 >>> a[6].explanation()
1987 'call6back'
1988 """,
1989
1990'WithExtra': """
1991 >>> class A(WithExtra):
1992 ... extraTable = [0,1,1,2,2]
1993 >>> a=A('t',alphabetSize=5)
1994 >>> a[1]
1995 Symbol(t, 1)
1996 >>> a.extraBits(2)
1997 1
1998 >>> a.mnemonic(4)
1999 '4'
2000 >>> a.readTupleAndExtra(BitStream(b'\x5b'))
2001 (3, Symbol(t, 3), 2, 3)
2002 """,
2003
2004'BoolCode': """
2005 >>> BoolCode('test')[0].explanation()
2006 '0: False'
2007 """,
2008
2009'Enumerator': """
2010 >>> class A(Enumerator):
2011 ... extraTable = [0,1,1,2,2]
2012 ... value0=3
2013 >>> a=A(alphabetLength=5)
2014 >>> a.value(3)
2015 Traceback (most recent call last):
2016 ...
2017 TypeError: value() missing 1 required positional argument: 'extra'
2018 >>> a.explanation(3,4)
2019 'xx 011: 8-11; 8+4=12'
2020 """,
2021
2022'WindowSizeAlphabet': """
2023 >>> windowSizeAlphabet = WindowSizeAlphabet()
2024 >>> windowSizeAlphabet[0]
2025 Traceback (most recent call last):
2026 ...
2027 ValueError: No symbol WindowSizeAlphabet[0]
2028 >>> len(windowSizeAlphabet)
2029 16
2030 >>> windowSizeAlphabet[21]
2031 Symbol(WSIZE, 21)
2032 >>> windowSizeAlphabet[21].bitPattern()
2033 '1001'
2034 >>> windowSizeAlphabet[21].extraBits()
2035 0
2036 >>> windowSizeAlphabet[21].index
2037 21
2038 >>> windowSizeAlphabet[10].value()
2039 1008
2040 >>> windowSizeAlphabet[10].explanation()
2041 'windowsize=(1<<10)-16=1008'
2042 >>> windowSizeAlphabet.showCode()
2043 0:65520 1100001:16368 1110001:32752 0011:262128
2044 0000001:131056 0010001:None 1001:2097136 1011:4194288
2045 1000001:4080 1010001:8176 0101:524272 0111:1048560
2046 0100001:1008 0110001:2032 1101:8388592 1111:16777200
2047 """,
2048
2049'TypeCountAlphabet': """
2050 >>> typeCountAlphabet = TypeCountAlphabet(description='bananas')
2051 >>> len(typeCountAlphabet)
2052 9
2053 >>> typeCountAlphabet[3]
2054 Symbol(BT#, 3)
2055 >>> typeCountAlphabet[9]
2056 Traceback (most recent call last):
2057 ...
2058 ValueError: No symbol TypeCountAlphabet[9]
2059 >>> print(typeCountAlphabet[3])
2060 xx,0101
2061 >>> typeCountAlphabet[8].value(127)
2062 256
2063 >>> typeCountAlphabet[4].explanation(2)
2064 'xxx,0111: 11 bananas'
2065 >>> typeCountAlphabet[0].explanation()
2066 '0: 1 banana'
2067 """,
2068
2069'DistanceParamAlphabet': """
2070 >>> dpa = DistanceParamAlphabet()
2071 >>> dpa.showCode()
2072 00:PF0 01:PF1 10:PF2 11:PF3
2073 >>> dpa.readTupleAndExtra(BitStream(b'\\x29'))
2074 (2, Symbol(DIST, 1), 4, 10)
2075 >>> dpa.explanation(2, 5)
2076 '2 postfix bits and 0101<<2=20 direct codes'
2077 """,
2078
2079'LiteralAlphabet': """
2080 >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS
2081 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0
2082 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1
2083 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2
2084 ...
2085 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff
2086 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc
2087 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd
2088 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce
2089 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf
2090 """,
2091
2092'BlockCountAlphabet': """
2093 >>> bc=BlockCountAlphabet('BCL')
2094 >>> len(bc)
2095 26
2096 >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02')
2097 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2098 'Block count: xx 00000: 1-4; 1+2=3'
2099 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2100 'Block count: xxx 00110: 33-40; 33+0=33'
2101 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2102 'Block count: xxxxxx 10001: 305-368; 305+28=333'
2103 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2104 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333'
2105 """,
2106
2107'Layout': """
2108 >>> olleke.pos = 0
2109 >>> l = Layout(olleke)
2110 >>> l.verboseRead(WindowSizeAlphabet())
2111 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
2112 4194288
2113 >>> l.verboseRead(BoolCode('LAST', description="Last block"))
2114 1 LAST Last block: 1: True
2115 True
2116 >>> l.verboseRead(BoolCode('EMPTY', description="Empty block"))
2117 0 EMPTY Empty block: 0: False
2118 False
2119 >>> l.verboseRead(MetablockLengthAlphabet())
2120 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
2121 47
2122 >>> olleke.pos = 76
2123 >>> l = Layout(olleke)
2124 >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True)
2125 000a 82 10|1100 D0 10[15*x]-3
2126 >>> x.explanation(0x86a3)
2127 '10[1000011010100011]-3: [0]+100000'
2128 """,
2129
2130'olleke': """
2131 >>> olleke.pos = 0
2132 >>> try: Layout(olleke).processStream()
2133 ... except NotImplementedError: pass
2134 ... #doctest: +REPORT_NDIFF
2135 addr hex binary context explanation
2136 -----------------------Stream header------------------------
2137 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
2138 ======================Metablock header======================
2139 1 LAST Last block: 1: True
2140 0 EMPTY Empty block: 0: False
2141 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
2142 -------------------Block type descriptors-------------------
2143 0003 00 0 BT#L 0: 1 literal block type
2144 0 BT#I 0: 1 insert&copy block type
2145 0 BT#D 0: 1 distance block type
2146 ------------------Distance code parameters------------------
2147 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
2148 -----------------------Context modes------------------------
2149 10 LC0 Context mode for type 0: 2(UTF8)
2150 ------------------------Context maps------------------------
2151 0 LT# 0: 1 literal prefix tree
2152 0 DT# 0: 1 distance prefix tree
2153 ---------------------Prefix code lists----------------------
2154 10 PFX L0 is complex with lengths 3,4,0,5,17...
2155 0005 4f 1|0 ##L0 len 3: coded with 3 bits
2156 0111 ##L0 len 4: coded with 1 bits
2157 10 ##L0 unused: coded with 3 bits
2158 0006 d6 0|0 ##L0 len 5: skipped
2159 011 ##L0 zero xxx: coded with 2 bits
2160 ***** Lengths for L0 will be coded as:
2161 0:len 4 01:zero xxx 011:unused 111:len 3
2162 0007 95 1|11,01 #L0 7+3 unused
2163 0 #L0 Length for \\n is 4 bits
2164 001,01 #L0 1+3 unused
2165 0008 44 010,0|1 #L0 total 19+2 unused
2166 0 #L0 Length for " " is 4 bits
2167 0 #L0 Length for ! is 4 bits
2168 0009 cb 011,|01 #L0 3+3 unused
2169 |110,01 #L0 total 35+6 unused
2170 000a 82 0 #L0 Length for K is 4 bits
2171 000,01 #L0 0+3 unused
2172 0 #L0 Length for O is 4 bits
2173 000b 4d 01|1 #L0 symbol P unused
2174 011 #L0 symbol Q unused
2175 0 #L0 Length for R is 4 bits
2176 000c 88 000,|01 #L0 0+3 unused
2177 |100,01 #L0 total 11+4 unused
2178 000d b6 0 #L0 Length for b is 4 bits
2179 011 #L0 symbol c unused
2180 011 #L0 symbol d unused
2181 000e 27 11|1 #L0 Length for e is 3 bits
2182 010,01 #L0 2+3 unused
2183 |0 #L0 Length for k is 4 bits
2184 000f 1f 111 #L0 Length for l is 3 bits
2185 011 #L0 symbol m unused
2186 0 #L0 Length for n is 4 bits
2187 |0 #L0 Length for o is 4 bits
2188 0010 c1 000,01 #L0 0+3 unused
2189 0 #L0 Length for s is 4 bits
2190 0011 b4 0|11 #L0 symbol t unused
2191 0 #L0 Length for u is 4 bits
2192 End of table. Prefix code L0:
2193 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s
2194 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u
2195 11,01 PFX IC0 is simple with 4 code words
2196 0012 2a |2Ah|10 IC0 ? bits: I5C4
2197 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7
2198 0015 22 0010|111011 IC0 ? bits: I8+xC5
2199 0016 8c 001100|0010 IC0 ? bits: I0C14+xx
2200 0 SHAPE False: lengths 2,2,2,2
2201 0017 74 10,0|1 PFX D0 is simple with 3 code words
2202 0018 a6 0|01110 D0 1 bit: 2last-3
2203 010011 D0 2 bits: 11xx-3
2204 0019 aa 01010|1 D0 2 bits: 11xxx-3
2205 ====================Meta block contents=====================
2206 |1,01 IC0 Literal: 9, copy: 5
2207 001a 41 0001 0,0=L0 O
2208 100 0,48=L0 l
2209 001b a2 10|0 0,62=L0 l
2210 000 0,63=L0 e
2211 001c a1 1|101 0,59=L0 k
2212 000 0,63=L0 e
2213 |1010 0,59=L0 " "
2214 001d b5 0101 0,11=L0 b
2215 |1011 0,60=L0 o
2216 001e 24 0 0,3=D0 Distance: 2last-3=8
2217 Seen before: "lleke"
2218 0,10 IC0 Literal: 6, copy: 7
2219 |0010 0,59=L0 \\n
2220 001f 89 1001 0,7=L0 R
2221 000 0,52=L0 e
2222 0020 fa 010|1 0,58=L0 b
2223 1111 0,63=L0 u
2224 0021 eb 011|1 0,59=L0 s
2225 11,01 0,3=D0 Absolute value: 12 (pos 8)
2226 Seen before: "olleke\\n"
2227 0022 db 01,1|1 IC0 Literal: 0, copy: 15
2228 |110,11 0,3=D0 Absolute value: 27 (pos 0)
2229 Seen before: "Olleke bolleke\\n"
2230 0023 f8 00 IC0 Literal: 5, copy: 4
2231 1110 0,7=L0 K
2232 0024 2c 00|11 0,52=L0 n
2233 1011 0,62=L0 o
2234 0025 0d 1|00 0,59=L0 l
2235 0110 0,63=L0 !
2236 """,
2237
2238'file': """
2239 >>> try: Layout(BitStream(
2240 ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb')
2241 ... .read())).processStream()
2242 ... except NotImplementedError: pass
2243 addr hex binary context explanation
2244 -----------------------Stream header------------------------
2245 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
2246 ======================Metablock header======================
2247 1 LAST Last block: 1: True
2248 0 EMPTY Empty block: 0: False
2249 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
2250 -------------------Block type descriptors-------------------
2251 0003 00 0 BT#L 0: 1 literal block type
2252 0 BT#I 0: 1 insert&copy block type
2253 0 BT#D 0: 1 distance block type
2254 ------------------Distance code parameters------------------
2255 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
2256 -----------------------Context modes------------------------
2257 10 LC0 Context mode for type 0: 2(UTF8)
2258 ------------------------Context maps------------------------
2259 0 LT# 0: 1 literal prefix tree
2260 0 DT# 0: 1 distance prefix tree
2261 ---------------------Prefix code lists----------------------
2262 0005 b0 0|1,01 PFX L0 is simple with 2 code words
2263 0006 b2 0|1011000 L0 1 bit: X
2264 0007 ea 0|1011001 L0 1 bit: Y
2265 01,01 PFX IC0 is simple with 2 code words
2266 0008 81 0000001|111 IC0 1 bit: I1C9&D=0
2267 0009 47 02 0|47h|1 IC0 1 bit: I1C9
2268 00,01 PFX D0 is simple with 1 code word
2269 000b 8a 010|000 D0 0 bits: 10x-3
2270 ====================Meta block contents=====================
2271 1 IC0 Literal: 1, copy: 9
2272 0 0,0=L0 X
2273 0,() 0,3=D0 Absolute value: 1 (pos 0)
2274 Seen before: "XXXXXXXXX"
2275 0 IC0 Literal: 1, copy: 9, same distance
2276 |1 0,54=L0 Y
2277 Seen before: "YYYYYYYYY"
2278 """,
2279
2280'XY': """
2281 >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream()
2282 ... except NotImplementedError: pass
2283 addr hex binary context explanation
2284 -----------------------Stream header------------------------
2285 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
2286 ======================Metablock header======================
2287 1 LAST Last block: 1: True
2288 0 EMPTY Empty block: 0: False
2289 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
2290 -------------------Block type descriptors-------------------
2291 0003 00 0 BT#L 0: 1 literal block type
2292 0 BT#I 0: 1 insert&copy block type
2293 0 BT#D 0: 1 distance block type
2294 ------------------Distance code parameters------------------
2295 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
2296 -----------------------Context modes------------------------
2297 10 LC0 Context mode for type 0: 2(UTF8)
2298 ------------------------Context maps------------------------
2299 0 LT# 0: 1 literal prefix tree
2300 0 DT# 0: 1 distance prefix tree
2301 ---------------------Prefix code lists----------------------
2302 0005 b0 0|1,01 PFX L0 is simple with 2 code words
2303 0006 b2 0|1011000 L0 1 bit: X
2304 0007 82 0|1011001 L0 1 bit: Y
2305 00,01 PFX IC0 is simple with 1 code word
2306 0008 84 0000100|100 IC0 0 bits: I4C6&D=0
2307 0009 00 00,0|1 PFX D0 is simple with 1 code word
2308 000a e0 0|00000 D0 0 bits: last
2309 ====================Meta block contents=====================
2310 () IC0 Literal: 4, copy: 6, same distance
2311 0 0,0=L0 X
2312 0 0,52=L0 X
2313 0 0,54=L0 X
2314 0 0,54=L0 X
2315 Seen before: "XXXXXX"
2316 () IC0 Literal: 4, copy: 6, same distance
2317 1 0,54=L0 Y
2318 1 0,54=L0 Y
2319 |1 0,54=L0 Y
2320 000b 01 1 0,54=L0 Y
2321 Seen before: "YYYYYY"
2322 """,
2323
2324'empty': """
2325 >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream()
2326 ... except NotImplementedError: pass
2327 addr hex binary context explanation
2328 -----------------------Stream header------------------------
2329 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056
2330 ======================Metablock header======================
2331 |1 LAST Last block: 1: True
2332 0001 16 0 EMPTY Empty block: 0: False
2333 11 MLEN 11: empty block
2334 0 RSVD Reserved (must be zero)
2335 0002 00 000000|00,01 SKIP skip length: 0h+1=1
2336 |00 SKIP 2 bits ignored
2337 Skipping to 4
2338 """,
2339
2340}
2341
2342if __name__=='__main__':
2343 import sys
2344 if len(sys.argv)>1:
2345 l = Layout(BitStream(open(sys.argv[1],'rb').read()))
2346 l.processStream()
2347 else:
2348 sys.path.append("h:/Persoonlijk/bin")
2349 try:
2350 import brotli
2351 open('brotlidump.br', 'wb').write(
2352 brotli.compress(
2353 open('brotlidump.py', 'r').read()
2354 ))
2355 olleke = BitStream(brotli.compress(
2356 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!'))
2357 except ImportError: pass
2358 import doctest
2359 doctest.testmod(optionflags=doctest.REPORT_NDIFF
2360 #|doctest.FAIL_FAST
2361 )