jneb | f62cd2b | 2016-12-20 14:41:47 +0100 | [diff] [blame] | 1 | #!python3 |
| 2 | """Program to dump contents of Brotli compressed files showing the compression format. |
| 3 | Jurjen N.E. Bos, 2016. |
| 4 | I found the following issues with the Brotli format: |
| 5 | - The distance alphabet has size 16+(48<<POSTFIX), |
| 6 | but the last symbols are useless. |
| 7 | It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter. |
| 8 | - The block type code is useless if NBLTYPES==2, you would only need 1 symbol |
| 9 | anyway, so why don't you just switch to "the other" type? |
| 10 | """ |
| 11 | import struct |
| 12 | from operator import itemgetter, methodcaller |
| 13 | from itertools import accumulate, repeat |
| 14 | from collections import defaultdict, deque |
| 15 | from functools import partial |
| 16 | |
| 17 | class InvalidStream(Exception): pass |
| 18 | #lookup table |
| 19 | L, I, D = "literal", "insert©", "distance" |
| 20 | pL, pI, pD = 'P'+L, 'P'+I, 'P'+D |
| 21 | |
| 22 | def outputCharFormatter(c): |
| 23 | """Show character in readable format |
| 24 | """ |
| 25 | #TODO 2: allow hex only output |
| 26 | if 32<c<127: return chr(c) |
| 27 | elif c==10: return '\\n' |
| 28 | elif c==13: return '\\r' |
| 29 | elif c==32: return '" "' |
| 30 | else: return '\\x{:02x}'.format(c) |
| 31 | |
| 32 | def outputFormatter(s): |
| 33 | """Show string or char. |
| 34 | """ |
| 35 | result = '' |
| 36 | def formatSubString(s): |
| 37 | for c in s: |
| 38 | if c==32: yield ' ' |
| 39 | else: yield outputCharFormatter(c) |
| 40 | if len(result)<200: return ''.join(formatSubString(s)) |
| 41 | else: |
| 42 | return ''.join(formatSubString(s[:100]))+'...'+ \ |
| 43 | ''.join(formatSubString(s[-100:])) |
| 44 | |
| 45 | |
| 46 | class BitStream: |
| 47 | """Represent a bytes object. Can read bits and prefix codes the way |
| 48 | Brotli does. |
| 49 | """ |
| 50 | def __init__(self, byteString): |
| 51 | self.data = byteString |
| 52 | #position in bits: byte pos is pos>>3, bit pos is pos&7 |
| 53 | self.pos = 0 |
| 54 | |
| 55 | def __repr__(self): |
| 56 | """Representation |
| 57 | >>> olleke |
| 58 | BitStream(pos=0:0) |
| 59 | """ |
| 60 | return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7) |
| 61 | |
| 62 | def read(self, n): |
| 63 | """Read n bits from the stream and return as an integer. |
| 64 | Produces zero bits beyond the stream. |
| 65 | >>> olleke.data[0]==27 |
| 66 | True |
| 67 | >>> olleke.read(5) |
| 68 | 27 |
| 69 | |
| 70 | >>> olleke |
| 71 | BitStream(pos=0:5) |
| 72 | """ |
| 73 | value = self.peek(n) |
| 74 | self.pos += n |
| 75 | if self.pos>len(self.data)*8: |
| 76 | raise ValueError('Read past end of stream') |
| 77 | return value |
| 78 | |
| 79 | def peek(self, n): |
| 80 | """Peek an n bit integer from the stream without updating the pointer. |
| 81 | It is not an error to read beyond the end of the stream. |
| 82 | >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803 |
| 83 | True |
| 84 | >>> olleke.peek(15) |
| 85 | 11803 |
| 86 | >>> hex(olleke.peek(32)) |
| 87 | '0x2e1b' |
| 88 | """ |
| 89 | #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3] |
| 90 | #convert to int: int.from_bytes(..., 'little') |
| 91 | #shift out the bits from the first byte: >>(self.pos&7) |
| 92 | #mask unwanted bits: & (1<<n)-1 |
| 93 | return int.from_bytes( |
| 94 | self.data[self.pos>>3:self.pos+n+7>>3], |
| 95 | 'little')>>(self.pos&7) & (1<<n)-1 |
| 96 | |
| 97 | def readBytes(self, n): |
| 98 | """Read n bytes from the stream on a byte boundary. |
| 99 | """ |
| 100 | if self.pos&7: raise ValueError('readBytes: need byte boundary') |
| 101 | result = self.data[self.pos>>3:(self.pos>>3)+n] |
| 102 | self.pos += 8*n |
| 103 | return result |
| 104 | |
| 105 | #-----------------------Symbol------------------------------------------- |
| 106 | class Symbol: |
| 107 | """A symbol in a code. |
| 108 | Refers back to the code that contains it. |
| 109 | Index is the place in the alphabet of the symbol. |
| 110 | """ |
| 111 | __slots__ = 'code', 'index' |
| 112 | def __init__(self, code, value): |
| 113 | self.code = code |
| 114 | self.index = value |
| 115 | |
| 116 | def __repr__(self): |
| 117 | return 'Symbol({}, {})'.format(self.code.name, self.index) |
| 118 | |
| 119 | def __len__(self): |
| 120 | """Number of bits in the prefix notation of this symbol |
| 121 | """ |
| 122 | return self.code.length(self.index) |
| 123 | |
| 124 | def __int__(self): |
| 125 | return self.index |
| 126 | |
| 127 | #these routines call equivalent routine in Code class |
| 128 | def bitPattern(self): |
| 129 | """Value of the symbol in the stream |
| 130 | """ |
| 131 | return self.code.bitPattern(self.index) |
| 132 | |
| 133 | def extraBits(self): |
| 134 | """Number of extra bits to read for this symbol |
| 135 | """ |
| 136 | return self.code.extraBits(self.index) |
| 137 | |
| 138 | def __str__(self): |
| 139 | """Short descriptor of the symbol without extra bits. |
| 140 | """ |
| 141 | return self.code.mnemonic(self.index) |
| 142 | |
| 143 | #requiring optional extra bits, if self.code supports them |
| 144 | def value(self, extra=None): |
| 145 | """The value used for processing. Can be a tuple. |
| 146 | with optional extra bits |
| 147 | """ |
| 148 | if isinstance(self.code, WithExtra): |
| 149 | if not 0<=extra<1<<self.extraBits(): |
| 150 | raise ValueError("value: extra value doesn't fit in extraBits") |
| 151 | return self.code.value(self.index, extra) |
| 152 | if extra is not None: |
| 153 | raise ValueError('value: no extra bits for this code') |
| 154 | return self.code.value(self.index) |
| 155 | |
| 156 | def explanation(self, extra=None): |
| 157 | """Long explanation of the value from the numeric value |
| 158 | with optional extra bits |
| 159 | Used by Layout.verboseRead when printing the value |
| 160 | """ |
| 161 | if isinstance(self.code, WithExtra): |
| 162 | return self.code.callback(self, extra) |
| 163 | return self.code.callback(self) |
| 164 | |
| 165 | #========================Code definitions================================== |
| 166 | class RangeDecoder: |
| 167 | """A decoder for the Code class that assumes the symbols |
| 168 | are encoded consecutively in binary. |
| 169 | It all depends on the "alphabetSize" property. |
| 170 | The range runs from 0 to alphabetSize-1. |
| 171 | This is the default decoder. |
| 172 | """ |
| 173 | def __init__(self, *, alphabetSize=None, bitLength=None, **args): |
| 174 | if bitLength is not None: alphabetSize = 1<<bitLength |
| 175 | if alphabetSize is not None: |
| 176 | self.alphabetSize = alphabetSize |
| 177 | self.maxLength = (alphabetSize-1).bit_length() |
| 178 | |
| 179 | def __len__(self): |
| 180 | return self.alphabetSize |
| 181 | |
| 182 | def __iter__(self): |
| 183 | """Produce all symbols. |
| 184 | """ |
| 185 | return map(partial(Symbol, self), range(len(self))) |
| 186 | |
| 187 | def __getitem__(self, index): |
| 188 | if index>=self.alphabetSize: raise ValueError('index out of range') |
| 189 | return Symbol(self, index) |
| 190 | |
| 191 | def bitPattern(self, index): |
| 192 | return '{:0{}b}'.format(index, self.maxLength) |
| 193 | |
| 194 | def length(self, index): |
| 195 | """Encoding length of given symbol. |
| 196 | Does not depend on index in this case. |
| 197 | """ |
| 198 | return self.maxLength |
| 199 | |
| 200 | def decodePeek(self, data): |
| 201 | """Find which symbol index matches the given data (from peek, as a number) |
| 202 | and return the number of bits decoded. |
| 203 | Can also be used to figure out length of a symbol. |
| 204 | """ |
| 205 | return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1) |
| 206 | |
| 207 | class PrefixDecoder: |
| 208 | """A decoder for the Code class that uses a prefix code. |
| 209 | The code is determined by encoding: |
| 210 | encode[p] gives the index corresponding to bit pattern p. |
| 211 | Used setDecode(decodeTable) to switch the decoder from the default |
| 212 | to a prefix decoder, or pass decodeTable at init. |
| 213 | You can also use setLength(lengthTable) |
| 214 | to define the encoding from the lengths. |
| 215 | The set of symbol values does not need to be consecutive. |
| 216 | """ |
| 217 | def __init__(self, *, decodeTable=None, **args): |
| 218 | if decodeTable is not None: self.setDecode(decodeTable) |
| 219 | |
| 220 | def __len__(self): |
| 221 | return len(self.decodeTable) |
| 222 | |
| 223 | def __iter__(self): |
| 224 | def revBits(index): |
| 225 | return self.bitPattern(index)[::-1] |
| 226 | return ( |
| 227 | Symbol(self, index) |
| 228 | for index in sorted(self.decodeTable.values(), key=revBits) |
| 229 | ) |
| 230 | |
| 231 | def __getitem__(self, index): |
| 232 | if index not in self.lengthTable: |
| 233 | raise ValueError('No symbol {}[{}]'.format( |
| 234 | self.__class__.__name__, index)) |
| 235 | return Symbol(self, index) |
| 236 | |
| 237 | def bitPattern(self, index): |
| 238 | bits = next(b for (b,s) in self.decodeTable.items() if s==index) |
| 239 | return '{:0{}b}'.format(bits, self.length(index)) |
| 240 | |
| 241 | def length(self, index): |
| 242 | """Encoding length of given symbol. |
| 243 | """ |
| 244 | return self.lengthTable[index] |
| 245 | |
| 246 | def decodePeek(self, data): |
| 247 | """Find which symbol index matches the given data (from peek, as a number) |
| 248 | and return the number of bits decoded. |
| 249 | Can also be used to figure out length of a symbol. |
| 250 | """ |
| 251 | #do binary search for word length |
| 252 | #invariant: lo<=length<=hi |
| 253 | lo, hi = self.minLength, self.maxLength |
| 254 | while lo<=hi: |
| 255 | mid = lo+hi>>1 |
| 256 | #note lo<=mid<hi at this point |
| 257 | mask = (1<<mid)-1 |
| 258 | #lets see what happens if we guess length is mid |
| 259 | try: index = self.decodeTable[data&mask] |
| 260 | except KeyError: |
| 261 | #too many bits specified, reduce estimated length |
| 262 | hi = mid-1 |
| 263 | continue |
| 264 | #we found a symbol, but there could be a longer match |
| 265 | symbolLength = self.lengthTable[index] |
| 266 | if symbolLength<=mid: |
| 267 | #all bits match, symbol must be right |
| 268 | return symbolLength, Symbol(self, index) |
| 269 | #there must be more bits to match |
| 270 | lo = mid+1 |
| 271 | return lo, Symbol(self, index) |
| 272 | |
| 273 | #routine to set up the tables |
| 274 | def setDecode(self, decodeTable): |
| 275 | """Store decodeTable, |
| 276 | and compute lengthTable, minLength, maxLength from encodings. |
| 277 | """ |
| 278 | self.decodeTable = decodeTable |
| 279 | #set of symbols with unknown length |
| 280 | todo = set(decodeTable) |
| 281 | #bit size under investigation |
| 282 | maskLength = 0 |
| 283 | lengthTable = {} |
| 284 | while todo: |
| 285 | mask = (1<<maskLength)-1 |
| 286 | #split the encodings that we didn't find yet using b bits |
| 287 | splitSymbols = defaultdict(list) |
| 288 | for s in todo: splitSymbols[s&mask].append(s) |
| 289 | #unique encodings have a length of maskLength bits |
| 290 | #set length, and remove from todo list |
| 291 | for s,subset in splitSymbols.items(): |
| 292 | if len(subset)==1: |
| 293 | lengthTable[self.decodeTable[s]] = maskLength |
| 294 | todo.remove(s) |
| 295 | #now investigate with longer mask |
| 296 | maskLength +=1 |
| 297 | #save result |
| 298 | self.lengthTable = lengthTable |
| 299 | self.minLength = min(lengthTable.values()) |
| 300 | self.maxLength = max(lengthTable.values()) |
| 301 | self.switchToPrefix() |
| 302 | |
| 303 | def setLength(self, lengthTable): |
| 304 | """Given the bit pattern lengths for symbols given in lengthTable, |
| 305 | set decodeTable, minLength, maxLength |
| 306 | """ |
| 307 | self.lengthTable = lengthTable |
| 308 | self.minLength = min(lengthTable.values()) |
| 309 | self.maxLength = max(lengthTable.values()) |
| 310 | #compute the backwards codes first; then reverse them |
| 311 | #compute (backwards) first code for every separate lengths |
| 312 | nextCodes = [] |
| 313 | #build codes for each length, from right to left |
| 314 | code = 0 |
| 315 | for bits in range(self.maxLength+1): |
| 316 | code <<= 1 |
| 317 | nextCodes.append(code) |
| 318 | code += sum(x==bits for x in lengthTable.values()) |
| 319 | self.decodeTable = {} |
| 320 | #count codes for each length, and store reversed in the table |
| 321 | for symbol in sorted(lengthTable): |
| 322 | bits = lengthTable[symbol] |
| 323 | bitpattern = '{:0{}b}'.format(nextCodes[bits], bits) |
| 324 | self.decodeTable[int(bitpattern[::-1], 2)] = symbol |
| 325 | nextCodes[bits] += 1 |
| 326 | self.switchToPrefix() |
| 327 | |
| 328 | def switchToPrefix(self): |
| 329 | """This routine makes sure the prefix decoder is activated. |
| 330 | """ |
| 331 | self.mode = PrefixDecoder |
| 332 | |
| 333 | class Code(RangeDecoder, PrefixDecoder): |
| 334 | """An alphabet of symbols, that can be read from a stream. |
| 335 | If you use setDecode or setLength, you have a prefix code, |
| 336 | otherwise you have a range code. |
| 337 | Features: |
| 338 | code[index] produces symbol with given index |
| 339 | value(index): value of symbol |
| 340 | mnemonic(index): short description of symbol |
| 341 | explanation(index): show meaning of symbol, shown in Layout.verboseRead |
| 342 | iter(code): produce all symbols in some order |
| 343 | name: show as context in Layout.verboseRead |
| 344 | """ |
| 345 | name = '?' |
| 346 | #callback is a function that gets the symbol and the extra bits |
| 347 | #default callback calls explanation |
| 348 | def __init__(self, name=None, *, callback=None, description='', **args): |
| 349 | """Don't forget to set either alphabetSize or decodeTable |
| 350 | """ |
| 351 | #set name when provided, otherwise take class variable |
| 352 | if name is not None: self.name = name |
| 353 | if callback is not None: self.callback = callback |
| 354 | self.description = description |
| 355 | #mode switch |
| 356 | if 'bitLength' in args or 'alphabetSize' in args: |
| 357 | self.mode = RangeDecoder |
| 358 | RangeDecoder.__init__(self, **args) |
| 359 | elif 'decodeTable' in args: |
| 360 | self.mode = PrefixDecoder |
| 361 | PrefixDecoder.__init__(self, **args) |
| 362 | else: |
| 363 | super().__init__(**args) |
| 364 | |
| 365 | def __repr__(self): |
| 366 | return self.__class__.__name__+' '+self.name |
| 367 | |
| 368 | #the routines that get switched between RangeDecoder and PrefixDecoder |
| 369 | def __len__(self): return self.mode.__len__(self) |
| 370 | def __iter__(self): return self.mode.__iter__(self) |
| 371 | def __getitem__(self, index): return self.mode.__getitem__(self, index) |
| 372 | def bitPattern(self, index): return self.mode.bitPattern(self, index) |
| 373 | def length(self, index): return self.mode.length(self, index) |
| 374 | def decodePeek(self, data): return self.mode.decodePeek(self, data) |
| 375 | #general routines |
| 376 | def value(self, index, extra=None): |
| 377 | """Get value of symbol for computations. |
| 378 | Override where needed. |
| 379 | """ |
| 380 | if extra is not None: |
| 381 | raise ValueError('value: no extra for this symbol') |
| 382 | return index |
| 383 | |
| 384 | def mnemonic(self, index): |
| 385 | """Give mnemonic of symbol. |
| 386 | Override where needed. |
| 387 | """ |
| 388 | return str(self.value(index)) |
| 389 | |
| 390 | def callback(self, symbol): |
| 391 | return self.explanation(symbol.index) |
| 392 | |
| 393 | def explanation(self, index): |
| 394 | """Long explanation of the value from the numeric value |
| 395 | This is a default routine. |
| 396 | You can customize in three ways: |
| 397 | - set description to add some text |
| 398 | - override to get more control |
| 399 | - set callback to make it dependent on you local variables |
| 400 | """ |
| 401 | value = self.value(index) |
| 402 | return '{0}{1}: {2}'.format( |
| 403 | self.description and self.description+': ', |
| 404 | self.bitPattern(index), |
| 405 | value, |
| 406 | ) |
| 407 | |
| 408 | def extraBits(self, index): |
| 409 | return 0 |
| 410 | |
| 411 | #Routines that use the decode interface |
| 412 | def showCode(self, width=80): |
| 413 | """Show all words of the code in a nice format. |
| 414 | """ |
| 415 | #make table of all symbols with binary strings |
| 416 | symbolStrings = [ |
| 417 | (self.bitPattern(s.index), self.mnemonic(s.index)) |
| 418 | for s in self |
| 419 | ] |
| 420 | #determine column widths the way Lisp programmers do it |
| 421 | leftColWidth, rightColWidth = map(max, map( |
| 422 | map, |
| 423 | repeat(len), |
| 424 | zip(*symbolStrings) |
| 425 | )) |
| 426 | colwidth = leftColWidth+rightColWidth |
| 427 | columns = 81//(colwidth+2) |
| 428 | rows = -(-len(symbolStrings)//columns) |
| 429 | def justify(bs): |
| 430 | b,s = bs |
| 431 | return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth) |
| 432 | for i in range(rows): |
| 433 | print(' '.join(map(justify, symbolStrings[i::rows])).rstrip()) |
| 434 | |
| 435 | def readTuple(self, stream): |
| 436 | """Read symbol from stream. Returns symbol, length. |
| 437 | """ |
| 438 | length, symbol = self.decodePeek(stream.peek(self.maxLength)) |
| 439 | stream.pos += length |
| 440 | return length, symbol |
| 441 | |
| 442 | def readTupleAndExtra(self, stream): |
| 443 | return self.readTuple(stream)+(0, None) |
| 444 | |
| 445 | class WithExtra(Code): |
| 446 | """Extension for Code so that symbol may have extra bits associated. |
| 447 | If you supply an extraTable, you can use extraBits |
| 448 | You can define an extraTable, |
| 449 | which allows to call extraBits to get the number of extraBits. |
| 450 | Otherwise, you can supply extraBits yourself. |
| 451 | Routine readTupleAndExtra now reads the extra bits too. |
| 452 | Value probably needs to be overridden; see Enumerator. |
| 453 | Note: this does not give you an decodeTable. |
| 454 | """ |
| 455 | #redefine these if you don't want to use an extraTable |
| 456 | def extraBits(self, index): |
| 457 | """Get the number of extra bits for this symbol. |
| 458 | """ |
| 459 | return self.extraTable[index] |
| 460 | |
| 461 | def mnemonic(self, index): |
| 462 | """This value must be independent of extra. |
| 463 | """ |
| 464 | return str(index) |
| 465 | |
| 466 | def readTupleAndExtra(self, stream): |
| 467 | """Read symbol and extrabits from stream. |
| 468 | Returns symbol length, symbol, extraBits, extra |
| 469 | >>> olleke.pos = 6 |
| 470 | >>> MetablockLengthAlphabet().readTupleAndExtra(olleke) |
| 471 | (2, Symbol(MLEN, 4), 16, 46) |
| 472 | """ |
| 473 | length, symbol = self.decodePeek(stream.peek(self.maxLength)) |
| 474 | stream.pos += length |
| 475 | extraBits = self.extraBits(symbol.index) |
| 476 | return length, symbol, extraBits, stream.read(extraBits) |
| 477 | |
| 478 | def explanation(self, index, extra=None): |
| 479 | """Expanded version of Code.explanation supporting extra bits. |
| 480 | If you don't supply extra, it is not mentioned. |
| 481 | """ |
| 482 | extraBits = 0 if extra is None else self.extraBits(index) |
| 483 | if not hasattr(self, 'extraTable'): |
| 484 | formatString = '{0}{3}' |
| 485 | lo = hi = value = self.value(index, extra) |
| 486 | elif extraBits==0: |
| 487 | formatString = '{0}{2}: {3}' |
| 488 | lo, hi = self.span(index) |
| 489 | value = lo |
| 490 | else: |
| 491 | formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}' |
| 492 | lo, hi = self.span(index) |
| 493 | value = lo+extra |
| 494 | return formatString.format( |
| 495 | self.description and self.description+': ', |
| 496 | 'x'*extraBits, |
| 497 | self.bitPattern(index), |
| 498 | lo, hi, |
| 499 | extra, |
| 500 | value, |
| 501 | ) |
| 502 | |
| 503 | def callback(self, symbol, extra): |
| 504 | return self.explanation(symbol.index, extra) |
| 505 | |
| 506 | class BoolCode(Code): |
| 507 | """Same as Code(bitLength=1), but shows a boolean. |
| 508 | """ |
| 509 | def __init__(self, name=None, **args): |
| 510 | super().__init__(name, bitLength=1, **args) |
| 511 | |
| 512 | def value(self, index, extra=None): |
| 513 | return bool(super().value(index, extra)) |
| 514 | |
| 515 | class Enumerator(WithExtra): |
| 516 | """Code that is defined by the ExtraTable. |
| 517 | extraTable is a class variable that contains |
| 518 | the extraBits of the symbols from 0 |
| 519 | value0 contains the value of symbol 0 |
| 520 | encodings is not neccessary, but allowed. |
| 521 | Note: place for FixedCode to make sure extraBits works |
| 522 | """ |
| 523 | def __init__(self, name=None, **args): |
| 524 | #if there is no decodeTable to determine length, compute it ourselves |
| 525 | if 'decodeTable' not in args: |
| 526 | args['alphabetSize'] = len(self.extraTable) |
| 527 | super().__init__(name, **args) |
| 528 | |
| 529 | def __len__(self): |
| 530 | return len(self.extraTable) |
| 531 | |
| 532 | def __getitem__(self, index): |
| 533 | """Faster than PrefixDecoder |
| 534 | """ |
| 535 | if index>=len(self.extraTable): |
| 536 | raise ValueError("No symbol {}[{}]".format( |
| 537 | self.__class__.__name__, index)) |
| 538 | return Symbol(self, index) |
| 539 | |
| 540 | def value(self, index, extra): |
| 541 | """Override if you don't define value0 and extraTable |
| 542 | """ |
| 543 | lower, upper = self.span(index) |
| 544 | value = lower+(extra or 0) |
| 545 | if value>upper: |
| 546 | raise ValueError('value: extra out of range') |
| 547 | return value |
| 548 | |
| 549 | def span(self, index): |
| 550 | """Give the range of possible values in a tuple |
| 551 | Useful for mnemonic and explanation |
| 552 | """ |
| 553 | lower = self.value0+sum(1<<x for x in self.extraTable[:index]) |
| 554 | upper = lower+(1<<self.extraTable[index]) |
| 555 | return lower, upper-1 |
| 556 | |
| 557 | #======================Code subclasses====================================== |
| 558 | #Alphabets used in the metablock header---------------------------------- |
| 559 | #For prefix codes |
| 560 | class PrefixCodeHeader(WithExtra): |
| 561 | """Header of prefix codes. |
| 562 | """ |
| 563 | def __init__(self, codename): |
| 564 | super().__init__('PFX', bitLength=2) |
| 565 | #this is the name of the code that it describes |
| 566 | self.codename = codename |
| 567 | |
| 568 | def extraBits(self, index): |
| 569 | return 2 if index==1 else 0 |
| 570 | |
| 571 | def value(self, index, extra): |
| 572 | """Returns ('Simple', #codewords) or ('Complex', HSKIP) |
| 573 | """ |
| 574 | if index==1: |
| 575 | if extra>3: |
| 576 | raise ValueError('value: extra out of range') |
| 577 | return 'Simple', extra+1 |
| 578 | if extra: |
| 579 | raise ValueError('value: extra out of range') |
| 580 | return 'Complex', index |
| 581 | |
| 582 | def explanation(self, index, extra): |
| 583 | if index==1: |
| 584 | return '{} is simple with {} code word{}'.format( |
| 585 | self.codename, extra+1, 's' if extra else '') |
| 586 | lengths = [1, 2, 3, 4, 0, 5, 17, 6] |
| 587 | return '{} is complex with lengths {}...'.format( |
| 588 | self.codename, |
| 589 | ','.join( |
| 590 | map(str, lengths[index:index+5])) |
| 591 | ) |
| 592 | |
| 593 | class TreeShapeAlhabet(BoolCode): |
| 594 | """The bit used to indicate if four word code is "deep" or "wide" |
| 595 | """ |
| 596 | name = 'SHAPE' |
| 597 | def value(self, index): |
| 598 | return [(2,2,2,2), (1,2,3,3)][index] |
| 599 | |
| 600 | def explanation(self, index): |
| 601 | return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index)) |
| 602 | |
| 603 | class LengthOfLengthAlphabet(Code): |
| 604 | """For use in decoding complex code descriptors. |
| 605 | >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('') |
| 606 | >>> print(lengthOfLengthAlphabet[2]) |
| 607 | coded with 2 bits |
| 608 | >>> len(lengthOfLengthAlphabet[0]) |
| 609 | 2 |
| 610 | >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)] |
| 611 | [2, 4, 3, 2, 2, 4] |
| 612 | >>> lengthOfLengthAlphabet.showCode() |
| 613 | 00:skipped 01:coded with 4 bits 0111:coded with 1 bits |
| 614 | 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits |
| 615 | """ |
| 616 | decodeTable = { |
| 617 | 0b00:0, 0b10:3, |
| 618 | 0b0111:1, 0b01:4, |
| 619 | 0b011:2, 0b1111:5, |
| 620 | } |
| 621 | |
| 622 | def __init__(self, name=None, **args): |
| 623 | super().__init__(name, decodeTable=self.decodeTable, **args) |
| 624 | |
| 625 | def mnemonic(self, index): |
| 626 | if index==0: return 'skipped' |
| 627 | return 'coded with {} bits'.format(index) |
| 628 | |
| 629 | def explanation(self, index, extra=None): |
| 630 | return self.description+': '+self.mnemonic(index) |
| 631 | |
| 632 | class LengthAlphabet(WithExtra): |
| 633 | """Length of symbols |
| 634 | Used during construction of a code. |
| 635 | """ |
| 636 | def __init__(self, name): |
| 637 | super().__init__(name, alphabetSize=18) |
| 638 | |
| 639 | def extraBits(self, index): |
| 640 | return {16:2, 17:3}.get(index, 0) |
| 641 | |
| 642 | def mnemonic(self, index): |
| 643 | if index==0: return 'unused' |
| 644 | elif index==16: return 'rep xx' |
| 645 | elif index==17: return 'zero xxx' |
| 646 | else: return 'len {}'.format(index) |
| 647 | |
| 648 | def explanation(self, index, extra): |
| 649 | return self.description.format(self[index], extra) |
| 650 | |
| 651 | def value(self, index, extra): |
| 652 | #the caller got the length already, so extra is enough |
| 653 | return extra |
| 654 | |
| 655 | #Stream header |
| 656 | class WindowSizeAlphabet(Code): |
| 657 | """The alphabet used for window size in the stream header. |
| 658 | >>> WindowSizeAlphabet()[10].explanation() |
| 659 | 'windowsize=(1<<10)-16=1008' |
| 660 | """ |
| 661 | decodeTable = { |
| 662 | 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22, |
| 663 | 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23, |
| 664 | 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24, |
| 665 | 0b1010001: 13, 0b0000001: 17, 0b1001: 21, |
| 666 | 0b0010001: None, |
| 667 | } |
| 668 | |
| 669 | name = 'WSIZE' |
| 670 | |
| 671 | def __init__(self, name=None): |
| 672 | super().__init__(name, decodeTable=self.decodeTable) |
| 673 | |
| 674 | def value(self, index): |
| 675 | #missing value gives index None |
| 676 | if index is None: return None |
| 677 | return (1<<index)-16 |
| 678 | |
| 679 | def explanation(self, index): |
| 680 | return 'windowsize=(1<<{})-16={}'.format( |
| 681 | index, (1<<index)-16) |
| 682 | |
| 683 | #Metablock |
| 684 | class MetablockLengthAlphabet(WithExtra): |
| 685 | """Used for the meta block length; |
| 686 | also indicates a block with no data |
| 687 | >>> metablockLengthAlphabet = MetablockLengthAlphabet() |
| 688 | >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0]) |
| 689 | Symbol(MLEN, 0) |
| 690 | 'empty' |
| 691 | >>> metablockLengthAlphabet[3] |
| 692 | Traceback (most recent call last): |
| 693 | ... |
| 694 | ValueError: No symbol MetablockLengthAlphabet[3] |
| 695 | >>> print(metablockLengthAlphabet[4]) |
| 696 | hhhh00 |
| 697 | >>> metablockLengthAlphabet[4].value(0x1000) |
| 698 | 4097 |
| 699 | >>> metablockLengthAlphabet[5].value(0x1000) |
| 700 | Traceback (most recent call last): |
| 701 | ... |
| 702 | InvalidStream: Zeros in high nibble of MLEN |
| 703 | >>> metablockLengthAlphabet[5].explanation(0x12345) |
| 704 | 'data length: 12345h+1=74566' |
| 705 | >>> metablockLengthAlphabet.showCode() |
| 706 | 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty |
| 707 | """ |
| 708 | decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6} |
| 709 | |
| 710 | name = 'MLEN' |
| 711 | def __init__(self, name=None): |
| 712 | super().__init__(name, decodeTable=self.decodeTable) |
| 713 | |
| 714 | def extraBits(self, index): |
| 715 | return index*4 |
| 716 | |
| 717 | def mnemonic(self, index): |
| 718 | if index==0: return 'empty' |
| 719 | return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) |
| 720 | |
| 721 | def value(self, index, extra): |
| 722 | extraBits = self.extraBits(index) |
| 723 | if not 0<=extra<1<<extraBits: |
| 724 | raise ValueError('value: extra out of range') |
| 725 | if index==0: return 0 |
| 726 | if index>4 and extra>>extraBits-4==0: raise InvalidStream( |
| 727 | 'Zeros in high nibble of MLEN') |
| 728 | return extra+1 |
| 729 | |
| 730 | def explanation(self, index, extra): |
| 731 | if index==0: return '11: empty block' |
| 732 | extraBits = self.extraBits(index) |
| 733 | return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1) |
| 734 | |
| 735 | |
| 736 | class ReservedAlphabet(BoolCode): |
| 737 | """The reserved bit that must be zero. |
| 738 | """ |
| 739 | name = 'RSVD' |
| 740 | def value(self, index): |
| 741 | if index: raise ValueError('Reserved bit is not zero') |
| 742 | |
| 743 | def explanation(self, index): |
| 744 | return 'Reserved (must be zero)' |
| 745 | |
| 746 | class FillerAlphabet(Code): |
| 747 | def __init__(self, *, streamPos): |
| 748 | super().__init__('SKIP', bitLength=(-streamPos)&7) |
| 749 | |
| 750 | def explanation(self, index): |
| 751 | return '{} bit{} ignored'.format( |
| 752 | self.length(index), |
| 753 | '' if self.length(index)==1 else 's', |
| 754 | ) |
| 755 | |
| 756 | class SkipLengthAlphabet(WithExtra): |
| 757 | """Used for the skip length in an empty metablock |
| 758 | >>> skipLengthAlphabet = SkipLengthAlphabet() |
| 759 | >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0]) |
| 760 | Symbol(SKIP, 0) |
| 761 | 'empty' |
| 762 | >>> skipLengthAlphabet[4] |
| 763 | Traceback (most recent call last): |
| 764 | ... |
| 765 | ValueError: index out of range |
| 766 | >>> print(skipLengthAlphabet[3]) |
| 767 | hhhhhh11 |
| 768 | >>> skipLengthAlphabet[2].value(0x1000) |
| 769 | 4097 |
| 770 | >>> skipLengthAlphabet[3].value(0x1000) |
| 771 | Traceback (most recent call last): |
| 772 | ... |
| 773 | InvalidStream: Zeros in high byte of SKIPBYTES |
| 774 | >>> skipLengthAlphabet[3].explanation(0x12345) |
| 775 | 'skip length: 12345h+1=74566' |
| 776 | >>> skipLengthAlphabet.showCode() |
| 777 | 00:empty 01:hh01 10:hhhh10 11:hhhhhh11 |
| 778 | """ |
| 779 | def __init__(self): |
| 780 | super().__init__('SKIP', bitLength=2) |
| 781 | |
| 782 | def extraBits(self, index): |
| 783 | return index*8 |
| 784 | |
| 785 | def mnemonic(self, index): |
| 786 | if index==0: return 'empty' |
| 787 | return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) |
| 788 | |
| 789 | def value(self, index, extra): |
| 790 | extraBits = self.extraBits(index) |
| 791 | if not 0<=extra<1<<extraBits: |
| 792 | raise ValueError('value: extra out of range') |
| 793 | if index==0: return 0 |
| 794 | if index>1 and extra>>extraBits-8==0: |
| 795 | raise InvalidStream('Zeros in high byte of SKIPBYTES') |
| 796 | return extra+1 |
| 797 | |
| 798 | def explanation(self, index, extra): |
| 799 | if index==0: return '00: no skip' |
| 800 | extraBits = self.extraBits(index) |
| 801 | return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1) |
| 802 | |
| 803 | |
| 804 | class TypeCountAlphabet(Enumerator): |
| 805 | """Used for giving block type counts and tree counts. |
| 806 | >>> TypeCountAlphabet(description='').showCode() |
| 807 | 0:0 0101:xx,0101 1011:xxxxx,1011 |
| 808 | 0001:0001 1101:xxxxxx,1101 0111:xxx,0111 |
| 809 | 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111 |
| 810 | """ |
| 811 | decodeTable = { |
| 812 | 0b0: 0, 0b1001: 5, |
| 813 | 0b0001: 1, 0b1011: 6, |
| 814 | 0b0011: 2, 0b1101: 7, |
| 815 | 0b0101: 3, 0b1111: 8, |
| 816 | 0b0111: 4, |
| 817 | } |
| 818 | |
| 819 | value0 = 1 |
| 820 | extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7] |
| 821 | name = 'BT#' |
| 822 | |
| 823 | def __init__(self, name=None, *, description): |
| 824 | super().__init__( |
| 825 | name, |
| 826 | decodeTable=self.decodeTable, |
| 827 | description=description) |
| 828 | |
| 829 | def mnemonic(self, index): |
| 830 | if index==0: return '0' |
| 831 | if index==1: return '0001' |
| 832 | return 'x'*(self.extraBits(index))+','+self.bitPattern(index) |
| 833 | |
| 834 | def explanation(self, index, extra): |
| 835 | value = self.value(index, extra) |
| 836 | description = self.description |
| 837 | if value==1: description = description[:-1] |
| 838 | return '{}: {} {}'.format( |
| 839 | self.mnemonic(index), |
| 840 | value, |
| 841 | description) |
| 842 | |
| 843 | class BlockTypeAlphabet(Code): |
| 844 | """The block types; this code works for all three kinds. |
| 845 | >>> b = BlockTypeAlphabet('T', NBLTYPES=5) |
| 846 | >>> print(*(x for x in b)) |
| 847 | prev +1 #0 #1 #2 #3 #4 |
| 848 | """ |
| 849 | def __init__(self, name, NBLTYPES, **args): |
| 850 | super().__init__(name, alphabetSize=NBLTYPES+2, **args) |
| 851 | self.NBLTYPES = NBLTYPES |
| 852 | |
| 853 | def mnemonic(self, index): |
| 854 | if index==0: return 'prev' |
| 855 | elif index==1: return '+1' |
| 856 | else: return '#'+str(index-2) |
| 857 | |
| 858 | def value(self, index): |
| 859 | return index-2 |
| 860 | |
| 861 | def explanation(self, index): |
| 862 | if index==0: return '0: previous' |
| 863 | elif index==1: return '1: increment' |
| 864 | else: return 'Set block type to: '+str(index-2) |
| 865 | |
| 866 | class BlockCountAlphabet(Enumerator): |
| 867 | """Block counts |
| 868 | >>> b = BlockCountAlphabet('L') |
| 869 | >>> print(b[25]) |
| 870 | [24*x]: BC16625-16793840 |
| 871 | """ |
| 872 | |
| 873 | value0 = 1 |
| 874 | extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24] |
| 875 | def __init__(self, name, **args): |
| 876 | super().__init__(name, alphabetSize=26, **args) |
| 877 | |
| 878 | def mnemonic(self, index): |
| 879 | extraBits = self.extraBits(index) |
| 880 | return '{}: BC{}-{}'.format( |
| 881 | 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits), |
| 882 | *self.span(index)) |
| 883 | |
| 884 | def explanation(self, index, extra): |
| 885 | return 'Block count: '+super().explanation(index, extra) |
| 886 | |
| 887 | class DistanceParamAlphabet(WithExtra): |
| 888 | """The distance parameters NPOSTFIX and NDIRECT. |
| 889 | Although these are treated as two in the description, this is easier. |
| 890 | """ |
| 891 | def __init__(self): |
| 892 | super().__init__('DIST', bitLength=2) |
| 893 | |
| 894 | def extraBits(self, index): |
| 895 | return 4 |
| 896 | |
| 897 | def value(self, index, extra): |
| 898 | """Returns NPOSTFIX and NDIRECT<<NPOSTFIX |
| 899 | """ |
| 900 | if extra>15: |
| 901 | raise ValueError('value: extra out of range') |
| 902 | return index, extra<<index |
| 903 | |
| 904 | def explanation(self, index, extra): |
| 905 | return '{} postfix bits and {:04b}<<{}={} direct codes'.format( |
| 906 | index, extra, index, extra<<index) |
| 907 | |
| 908 | def mnemonic(self, index): |
| 909 | return 'PF'+str(index) |
| 910 | |
| 911 | class LiteralContextMode(Code): |
| 912 | """For the literal context modes. |
| 913 | >>> LiteralContextMode().showCode() |
| 914 | 00:LSB6 01:MSB6 10:UTF8 11:Signed |
| 915 | >>> LiteralContextMode().explanation(2) |
| 916 | 'Context mode for type 9: 2(UTF8)' |
| 917 | """ |
| 918 | |
| 919 | def __init__(self, *, number=9): |
| 920 | super().__init__('LC'+str(number), bitLength=2) |
| 921 | self.number = number |
| 922 | |
| 923 | def mnemonic(self, index): |
| 924 | return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index] |
| 925 | |
| 926 | def explanation(self, index): |
| 927 | return 'Context mode for type {}: {}({})'.format( |
| 928 | self.number, |
| 929 | index, |
| 930 | self.mnemonic(index)) |
| 931 | |
| 932 | class RLEmaxAlphabet(Enumerator): |
| 933 | """Used for describing the run length encoding used for describing context maps. |
| 934 | >>> RLEmaxAlphabet().showCode() |
| 935 | 0:1 1:more |
| 936 | """ |
| 937 | value0 = 0 |
| 938 | extraTable = [0, 4] |
| 939 | name = 'RLE#' |
| 940 | |
| 941 | def mnemonic(self, index): |
| 942 | return ['1', 'more'][index] |
| 943 | |
| 944 | def explanation(self, index, extra): |
| 945 | description = self.description and self.description+': ' |
| 946 | if index==0: return description+'No RLE coding' |
| 947 | return '{}xxxx 1: RLEMAX={}'.format(description, extra+1) |
| 948 | |
| 949 | class TreeAlphabet(WithExtra): |
| 950 | """The alphabet to enumerate entries (called trees) in the context map. |
| 951 | parameters are RLEMAX and NTREES |
| 952 | >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5) |
| 953 | >>> len(t) |
| 954 | 8 |
| 955 | >>> print(t[2]) |
| 956 | xx+4 zeroes |
| 957 | >>> t[3].explanation(2) |
| 958 | '8+010=10 zeroes' |
| 959 | >>> t[0].value(0) |
| 960 | (1, 0) |
| 961 | """ |
| 962 | name = 'CMI' |
| 963 | def __init__(self, name=None, *, RLEMAX, NTREES, **args): |
| 964 | super().__init__(name, alphabetSize=RLEMAX+NTREES, **args) |
| 965 | self.RLEMAX = RLEMAX |
| 966 | self.NTREES = NTREES |
| 967 | |
| 968 | def extraBits(self, index): |
| 969 | if 0<index<=self.RLEMAX: return index |
| 970 | return 0 |
| 971 | |
| 972 | def mnemonic(self, index): |
| 973 | if index==0: return 'map #0' |
| 974 | if index<=self.RLEMAX: |
| 975 | return '{}+{} zeroes'.format('x'*index, 1<<index) |
| 976 | return 'map #{}'.format(index-self.RLEMAX) |
| 977 | |
| 978 | def value(self, index, extra): |
| 979 | """Give count and value.""" |
| 980 | index = index |
| 981 | if index==0: return 1, 0 |
| 982 | if index<=self.RLEMAX: return (1<<index)+extra, 0 |
| 983 | return 1, index-self.RLEMAX |
| 984 | |
| 985 | def explanation(self, index, extra): |
| 986 | description = self.description and self.description+': ' |
| 987 | if index==0: return description+'map #0' |
| 988 | if index<=self.RLEMAX: |
| 989 | return '{}+{:0{}b}={} zeroes'.format( |
| 990 | (1<<index), |
| 991 | extra, self.extraBits(index), |
| 992 | (1<<index)+extra) |
| 993 | return '{}map #{}-{}={}'.format( |
| 994 | description, |
| 995 | index, self.RLEMAX, index-self.RLEMAX) |
| 996 | |
| 997 | #Prefix alphabets for the data stream---------------------------------- |
| 998 | class LiteralAlphabet(Code): |
| 999 | """Alphabet of symbols. |
| 1000 | """ |
| 1001 | minLength = maxLength = 8 |
| 1002 | def __init__(self, number): |
| 1003 | super().__init__('L'+str(number), alphabetSize=1<<8) |
| 1004 | |
| 1005 | def mnemonic(self, index): |
| 1006 | return outputCharFormatter(index) |
| 1007 | |
| 1008 | def value(self, index, extra=None): |
| 1009 | return index |
| 1010 | |
| 1011 | def explanation(self, index, extra=None): |
| 1012 | return self.mnemonic(index) |
| 1013 | |
| 1014 | class InsertLengthAlphabet(Enumerator): |
| 1015 | """Intern code for insert counts |
| 1016 | """ |
| 1017 | value0 = 0 |
| 1018 | extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24] |
| 1019 | |
| 1020 | class CopyLengthAlphabet(Enumerator): |
| 1021 | value0 = 2 |
| 1022 | extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24] |
| 1023 | |
| 1024 | class InsertAndCopyAlphabet(WithExtra): |
| 1025 | """The insert and copy code |
| 1026 | >>> for x in range(0,704,704//13): |
| 1027 | ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x]) |
| 1028 | 0 I0C2&D=0 |
| 1029 | 110110 I6+xC8&D=0 |
| 1030 | 1101100 I5C22+xxx&D=0 |
| 1031 | 10100010 I4C4 |
| 1032 | 11011000 I3C10+x |
| 1033 | 100001110 I14+xxC8 |
| 1034 | 101000100 I10+xxC22+xxx |
| 1035 | 101111010 I98+xxxxxC14+xx |
| 1036 | 110110000 I6+xC70+xxxxx |
| 1037 | 111100110 I1090+[10*x]C8 |
| 1038 | 1000011100 I26+xxxC326+[8*x] |
| 1039 | 1001010010 I322+[8*x]C14+xx |
| 1040 | 1010001000 I194+[7*x]C70+xxxxx |
| 1041 | 1010111110 I22594+[24*x]C1094+[10*x] |
| 1042 | """ |
| 1043 | insertLengthAlphabet = InsertLengthAlphabet(None) |
| 1044 | copyLengthAlphabet = CopyLengthAlphabet(None) |
| 1045 | |
| 1046 | def __init__(self, number=''): |
| 1047 | super().__init__('IC'+str(number), bitLength=10) |
| 1048 | |
| 1049 | def __len__(self): |
| 1050 | return 704 |
| 1051 | |
| 1052 | def extraBits(self, index): |
| 1053 | insertSymbol, copySymbol, dist0 = self.splitSymbol(index) |
| 1054 | return InsertLengthAlphabet.extraTable[insertSymbol.index] + \ |
| 1055 | CopyLengthAlphabet.extraTable[copySymbol.index] |
| 1056 | |
| 1057 | def splitSymbol(self, index): |
| 1058 | """Give relevant values for computations: |
| 1059 | (insertSymbol, copySymbol, dist0flag) |
| 1060 | """ |
| 1061 | #determine insert and copy upper bits from table |
| 1062 | row = [0,0,1,1,2,2,1,3,2,3,3][index>>6] |
| 1063 | col = [0,1,0,1,0,1,2,0,2,1,2][index>>6] |
| 1064 | #determine inserts and copy sub codes |
| 1065 | insertLengthCode = row<<3 | index>>3&7 |
| 1066 | if row: insertLengthCode -= 8 |
| 1067 | copyLengthCode = col<<3 | index&7 |
| 1068 | return ( |
| 1069 | Symbol(self.insertLengthAlphabet, insertLengthCode), |
| 1070 | Symbol(self.copyLengthAlphabet, copyLengthCode), |
| 1071 | row==0 |
| 1072 | ) |
| 1073 | |
| 1074 | def mnemonic(self, index): |
| 1075 | """Make a nice mnemonic |
| 1076 | """ |
| 1077 | i,c,d0 = self.splitSymbol(index) |
| 1078 | iLower, _ = i.code.span(i.index) |
| 1079 | iExtra = i.extraBits() |
| 1080 | cLower, _ = c.code.span(c.index) |
| 1081 | cExtra = c.extraBits() |
| 1082 | return 'I{}{}{}C{}{}{}{}'.format( |
| 1083 | iLower, |
| 1084 | '+' if iExtra else '', |
| 1085 | 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra), |
| 1086 | cLower, |
| 1087 | '+' if cExtra else '', |
| 1088 | 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra), |
| 1089 | '&D=0' if d0 else '') |
| 1090 | |
| 1091 | def value(self, index, extra): |
| 1092 | i,c,d0 = self.splitSymbol(index) |
| 1093 | iExtra = i.extraBits() |
| 1094 | ce, ie = extra>>iExtra, extra&(1<<iExtra)-1 |
| 1095 | insert = i.value(ie) |
| 1096 | copy = c.value(ce) |
| 1097 | return insert, copy, d0 |
| 1098 | |
| 1099 | def explanation(self, index, extra): |
| 1100 | insert, copy, d0 = self.value(index, extra) |
| 1101 | if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy) |
| 1102 | else: return 'Literal: {}, copy: {}'.format(insert, copy) |
| 1103 | |
| 1104 | class DistanceAlphabet(WithExtra): |
| 1105 | """Represent the distance encoding. |
| 1106 | Dynamically generated alphabet. |
| 1107 | This is what the documentation should have said: |
| 1108 | Ignoring offsets for the moment, the "long" encoding works as follows: |
| 1109 | Write the distance in binary as follows: |
| 1110 | 1xy..yz..z, then the distance symbol consists of n..nxz..z |
| 1111 | Where: |
| 1112 | n is one less than number of bits in y |
| 1113 | x is a single bit |
| 1114 | y..y are n+1 extra bits (encoded in the bit stream) |
| 1115 | z..z is NPOSTFIX bits that are part of the symbol |
| 1116 | The offsets are so as to start at the lowest useable value: |
| 1117 | if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1 |
| 1118 | then n..nxz..z is symbol -NDIRECT-16 |
| 1119 | >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| 1120 | >>> print(d[4], d[17], d[34]) |
| 1121 | last-1 1 10xx00-5 |
| 1122 | >>> [str(d[x]) for x in range(26, 32)] |
| 1123 | ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5'] |
| 1124 | """ |
| 1125 | def __init__(self, number, *, NPOSTFIX, NDIRECT): |
| 1126 | self.NPOSTFIX = NPOSTFIX |
| 1127 | self.NDIRECT = NDIRECT |
| 1128 | #set length |
| 1129 | #Actually, not all symbols are used, |
| 1130 | #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX) |
| 1131 | super().__init__('D'+str(number), |
| 1132 | alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX)) |
| 1133 | |
| 1134 | def extraBits(self, index): |
| 1135 | """Indicate how many extra bits are needed to interpret symbol |
| 1136 | >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| 1137 | >>> [d[i].extraBits() for i in range(26)] |
| 1138 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
| 1139 | >>> [d[i].extraBits() for i in range(26,36)] |
| 1140 | [1, 1, 1, 1, 1, 1, 1, 1, 2, 2] |
| 1141 | """ |
| 1142 | if index<16+self.NDIRECT: return 0 |
| 1143 | return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) |
| 1144 | |
| 1145 | def value(self, dcode, dextra): |
| 1146 | """Decode value of symbol together with the extra bits. |
| 1147 | >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| 1148 | >>> d[34].value(2) |
| 1149 | (0, 35) |
| 1150 | """ |
| 1151 | if dcode<16: |
| 1152 | return [(1,0),(2,0),(3,0),(4,0), |
| 1153 | (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3), |
| 1154 | (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3) |
| 1155 | ][dcode] |
| 1156 | if dcode<16+self.NDIRECT: |
| 1157 | return (0,dcode-16) |
| 1158 | #we use the original formulas, instead of my clear explanation |
| 1159 | POSTFIX_MASK = (1 << self.NPOSTFIX) - 1 |
| 1160 | ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) |
| 1161 | hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX |
| 1162 | lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK |
| 1163 | offset = ((2 + (hcode & 1)) << ndistbits) - 4 |
| 1164 | distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1 |
| 1165 | return (0,distance) |
| 1166 | |
| 1167 | def mnemonic(self, index, verbose=False): |
| 1168 | """Give mnemonic representation of meaning. |
| 1169 | verbose compresses strings of x's |
| 1170 | """ |
| 1171 | if index<16: |
| 1172 | return ['last', '2last', '3last', '4last', |
| 1173 | 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3', |
| 1174 | '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3' |
| 1175 | ][index] |
| 1176 | if index<16+self.NDIRECT: |
| 1177 | return str(index-16) |
| 1178 | #construct strings like "1xx01-15" |
| 1179 | index -= self.NDIRECT+16 |
| 1180 | hcode = index >> self.NPOSTFIX |
| 1181 | lcode = index & (1<<self.NPOSTFIX)-1 |
| 1182 | if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}' |
| 1183 | else: formatString = '1{0}{1}{4:+d}' |
| 1184 | return formatString.format( |
| 1185 | hcode&1, |
| 1186 | 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1), |
| 1187 | lcode, self.NPOSTFIX, |
| 1188 | self.NDIRECT+1-(4<<self.NPOSTFIX)) |
| 1189 | |
| 1190 | def explanation(self, index, extra): |
| 1191 | """ |
| 1192 | >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| 1193 | >>> d[55].explanation(13) |
| 1194 | '11[1101]01-5: [0]+240' |
| 1195 | """ |
| 1196 | extraBits = self.extraBits(index) |
| 1197 | extraString = '[{:0{}b}]'.format(extra, extraBits) |
| 1198 | return '{0}: [{1[0]}]{1[1]:+d}'.format( |
| 1199 | self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString), |
| 1200 | self.value(index, extra)) |
| 1201 | |
| 1202 | #Classes for doing actual work------------------------------------------ |
| 1203 | class ContextModeKeeper: |
| 1204 | """For computing the literal context mode. |
| 1205 | You feed it characters, and it computes indices in the context map. |
| 1206 | """ |
| 1207 | def __init__(self, mode): |
| 1208 | self.chars = deque([0,0], maxlen=2) |
| 1209 | self.mode = mode |
| 1210 | |
| 1211 | def setContextMode(self, mode): |
| 1212 | """Switch to given context mode (0..3)""" |
| 1213 | self.mode = mode |
| 1214 | def getIndex(self): |
| 1215 | if self.mode==0: #LSB6 |
| 1216 | return self.chars[1]&0x3f |
| 1217 | elif self.mode==1: #MSB6 |
| 1218 | return self.chars[1]>>2 |
| 1219 | elif self.mode==2: #UTF8: character class of previous and a bit of the second |
| 1220 | p2,p1 = self.chars |
| 1221 | return self.lut0[p1]|self.lut1[p2] |
| 1222 | elif self.mode==3: #Signed: initial bits of last two bytes |
| 1223 | p2,p1 = self.chars |
| 1224 | return self.lut2[p1]<<3|self.lut2[p2] |
| 1225 | |
| 1226 | def add(self, index): |
| 1227 | """Adjust the context for output char (as int).""" |
| 1228 | self.chars.append(index) |
| 1229 | |
| 1230 | #0: control #16: quote #32: ,:; #48: AEIOU |
| 1231 | #4: tab/lf/cr #20: % #36: . #52: BC..Z |
| 1232 | #8: space #24: (<[{ #40: = #56: aeiou |
| 1233 | #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z |
| 1234 | lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0, |
| 1235 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1236 | 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12, |
| 1237 | 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12, |
| 1238 | 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48, |
| 1239 | 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12, |
| 1240 | 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56, |
| 1241 | 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0 |
| 1242 | ]+[0,1]*32+[2,3]*32 |
| 1243 | #0: space 1:punctuation 2:digit/upper 3:lower |
| 1244 | lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1245 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 1246 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1247 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, |
| 1248 | 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 1249 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, |
| 1250 | 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 1251 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0 |
| 1252 | ]+[0]*96+[2]*32 |
| 1253 | #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1 |
| 1254 | lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7] |
| 1255 | assert len(lut0)==len(lut1)==len(lut2)==256 |
| 1256 | |
| 1257 | class WordList: |
| 1258 | """Word list. |
| 1259 | >>> WordList().word(7, 35555) |
| 1260 | b'Program to ' |
| 1261 | """ |
| 1262 | NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10, |
| 1263 | 10, 10, 10, 9, 9, 8, 7, 7, 8, 7, |
| 1264 | 7, 6, 6, 5, 5] |
| 1265 | def __init__(self): |
| 1266 | self.file = open('dict', 'rb') |
| 1267 | self.compileActions() |
| 1268 | |
| 1269 | def word(self, size, dist): |
| 1270 | """Get word |
| 1271 | """ |
| 1272 | #split dist in index and action |
| 1273 | ndbits = self.NDBITS[size] |
| 1274 | index = dist&(1<<ndbits)-1 |
| 1275 | action = dist>>ndbits |
| 1276 | #compute position in file |
| 1277 | position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index |
| 1278 | self.file.seek(position) |
| 1279 | return self.doAction(self.file.read(size), action) |
| 1280 | |
| 1281 | def upperCase1(self, word): |
| 1282 | word = word.decode('utf8') |
| 1283 | word = word[0].upper()+word[1:] |
| 1284 | return word.encode('utf8') |
| 1285 | |
| 1286 | |
| 1287 | #Super compact form of action table. |
| 1288 | #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst |
| 1289 | actionTable = r""" |
| 1290 | 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_ |
| 1291 | 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+. |
| 1292 | 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w |
| 1293 | 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+, |
| 1294 | 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+=" |
| 1295 | 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+=" |
| 1296 | 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_ |
| 1297 | 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_ |
| 1298 | 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\' |
| 1299 | 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+, |
| 1300 | 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+=" |
| 1301 | 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_ |
| 1302 | 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+, |
| 1303 | 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+( |
| 1304 | 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._ |
| 1305 | 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+. |
| 1306 | 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\' |
| 1307 | 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._ |
| 1308 | 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+=" |
| 1309 | 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\' |
| 1310 | 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\' |
| 1311 | 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+. |
| 1312 | 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+"> |
| 1313 | 23:w[:-3] 48:w[:-7] 98:_+w+=\' |
| 1314 | 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+, |
| 1315 | """ |
| 1316 | |
| 1317 | def compileActions(self): |
| 1318 | """Build the action table from the text above |
| 1319 | """ |
| 1320 | import re |
| 1321 | self.actionList = actions = [None]*121 |
| 1322 | #Action 73, which is too long, looks like this when expanded: |
| 1323 | actions[73] = "b' the '+w+b' of the '" |
| 1324 | #find out what the columns are |
| 1325 | actionLines = self.actionTable.splitlines() |
| 1326 | colonPositions = [m.start() |
| 1327 | for m in re.finditer(':',actionLines[1]) |
| 1328 | ]+[100] |
| 1329 | columns = [(colonPositions[i]-3,colonPositions[i+1]-3) |
| 1330 | for i in range(len(colonPositions)-1)] |
| 1331 | for line in self.actionTable.splitlines(keepends=False): |
| 1332 | for start,end in columns: |
| 1333 | action = line[start:end] |
| 1334 | #skip empty actions |
| 1335 | if not action or action.isspace(): continue |
| 1336 | #chop it up, and check if the colon is properly placed |
| 1337 | index, colon, action = action[:3], action[3], action[4:] |
| 1338 | assert colon==':' |
| 1339 | #remove filler spaces at right |
| 1340 | action = action.rstrip() |
| 1341 | #replace space symbols |
| 1342 | action = action.replace('_', ' ') |
| 1343 | wPos = action.index('w') |
| 1344 | #add quotes around left string when present |
| 1345 | #translation: any pattern from beginning, up to |
| 1346 | #(but not including) a + following by a w later on |
| 1347 | action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action) |
| 1348 | #add quotes around right string when present |
| 1349 | #translation: anything with a w in it, followed by a + |
| 1350 | #and a pattern up to the end |
| 1351 | #(there is no variable lookbehind assertion, |
| 1352 | #so we have to copy the pattern) |
| 1353 | action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action) |
| 1354 | #expand shortcut for uppercaseAll |
| 1355 | action = action.replace(".U", ".upper()") |
| 1356 | #store action |
| 1357 | actions[int(index)] = action |
| 1358 | |
| 1359 | def doAction(self, w, action): |
| 1360 | """Perform the proper action |
| 1361 | """ |
| 1362 | #set environment for the UpperCaseFirst |
| 1363 | U = self.upperCase1 |
| 1364 | return eval(self.actionList[action], locals()) |
| 1365 | |
| 1366 | class Layout: |
| 1367 | """Class to layout the output. |
| 1368 | """ |
| 1369 | #display width of hexdata+bitdata |
| 1370 | width = 25 |
| 1371 | #general |
| 1372 | def __init__(self, stream): |
| 1373 | self.stream = stream |
| 1374 | self.bitPtr = self.width |
| 1375 | |
| 1376 | def makeHexData(self, pos): |
| 1377 | """Produce hex dump of all data containing the bits |
| 1378 | from pos to stream.pos |
| 1379 | """ |
| 1380 | firstAddress = pos+7>>3 |
| 1381 | lastAddress = self.stream.pos+7>>3 |
| 1382 | return ''.join(map('{:02x} '.format, |
| 1383 | self.stream.data[firstAddress:lastAddress])) |
| 1384 | |
| 1385 | def formatBitData(self, pos, width1, width2=0): |
| 1386 | """Show formatted bit data: |
| 1387 | Bytes are separated by commas |
| 1388 | whole bytes are displayed in hex |
| 1389 | >>> Layout(olleke).formatBitData(6, 2, 16) |
| 1390 | '|00h|2Eh,|00' |
| 1391 | >>> Layout(olleke).formatBitData(4, 1, 0) |
| 1392 | '1' |
| 1393 | """ |
| 1394 | result = [] |
| 1395 | #make empty prefix code explicit |
| 1396 | if width1==0: result = ['()', ','] |
| 1397 | for width in width1, width2: |
| 1398 | #skip empty width2 |
| 1399 | if width==0: continue |
| 1400 | #build result backwards in a list |
| 1401 | while width>0: |
| 1402 | availableBits = 8-(pos&7) |
| 1403 | if width<availableBits: |
| 1404 | #read partial byte, beginning nor ending at boundary |
| 1405 | data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1 |
| 1406 | result.append('{:0{}b}'.format(data, width)) |
| 1407 | elif availableBits<8: |
| 1408 | #read rest of byte, ending at boundary |
| 1409 | data = self.stream.data[pos>>3] >> (pos&7) |
| 1410 | result.append('|{:0{}b}'.format(data, availableBits)) |
| 1411 | else: |
| 1412 | #read whole byte (in hex), beginning and ending at boundary |
| 1413 | data = self.stream.data[pos>>3] |
| 1414 | result.append('|{:02X}h'.format(data)) |
| 1415 | width -= availableBits |
| 1416 | pos += availableBits |
| 1417 | #if width overshot from the availableBits subtraction, fix it |
| 1418 | pos += width |
| 1419 | #add comma to separate fields |
| 1420 | result.append(',') |
| 1421 | #concatenate pieces, reversed, skipping the last space |
| 1422 | return ''.join(result[-2::-1]) |
| 1423 | |
| 1424 | def readPrefixCode(self, alphabet): |
| 1425 | """give alphabet the prefix code that is read from the stream |
| 1426 | Called for the following alphabets, in this order: |
| 1427 | The alphabet in question must have a "logical" order, |
| 1428 | otherwise the assignment of symbols doesn't work. |
| 1429 | """ |
| 1430 | mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name)) |
| 1431 | if mode=='Complex': |
| 1432 | #for a complex code, numberOfSymbols means hskip |
| 1433 | self.readComplexCode(numberOfSymbols, alphabet) |
| 1434 | return alphabet |
| 1435 | else: |
| 1436 | table = [] |
| 1437 | #Set table of lengths for mnemonic function |
| 1438 | lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1] |
| 1439 | #adjust mnemonic function of alphabet class |
| 1440 | def myMnemonic(index): |
| 1441 | return '{} bit{}: {}'.format( |
| 1442 | lengths[i], |
| 1443 | '' if lengths[i]==1 else 's', |
| 1444 | alphabet.__class__.mnemonic(alphabet, index) |
| 1445 | ) |
| 1446 | alphabet.mnemonic = myMnemonic |
| 1447 | for i in range(numberOfSymbols): |
| 1448 | table.append(self.verboseRead(alphabet, skipExtra=True).index) |
| 1449 | #restore mnemonic |
| 1450 | del alphabet.mnemonic |
| 1451 | if numberOfSymbols==4: |
| 1452 | #read tree shape to redefine lengths |
| 1453 | lengths = self.verboseRead(TreeShapeAlhabet()) |
| 1454 | #construct the alphabet prefix code |
| 1455 | alphabet.setLength(dict(zip(table, lengths))) |
| 1456 | return alphabet |
| 1457 | |
| 1458 | def readComplexCode(self, hskip, alphabet): |
| 1459 | """Read complex code""" |
| 1460 | stream = self.stream |
| 1461 | #read the lengths for the length code |
| 1462 | lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:] |
| 1463 | codeLengths = {} |
| 1464 | total = 0 |
| 1465 | lol = LengthOfLengthAlphabet('##'+alphabet.name) |
| 1466 | #lengthCode will be used for coding the lengths of the new code |
| 1467 | #we use it for display until now; definition comes below |
| 1468 | lengthCode = LengthAlphabet('#'+alphabet.name) |
| 1469 | lengthIter = iter(lengths) |
| 1470 | while total<32: |
| 1471 | newSymbol = next(lengthIter) |
| 1472 | lol.description = str(lengthCode[newSymbol]) |
| 1473 | length = self.verboseRead(lol) |
| 1474 | if length: |
| 1475 | codeLengths[newSymbol] = length |
| 1476 | total += 32>>length |
| 1477 | if total>=32: |
| 1478 | break |
| 1479 | if total>32: raise ValueError("Stream format") |
| 1480 | #Now set the encoding of the lengthCode |
| 1481 | lengthCode.setLength(codeLengths) |
| 1482 | print("***** Lengths for {} will be coded as:".format(alphabet.name)) |
| 1483 | lengthCode.showCode() |
| 1484 | #Now determine the symbol lengths with the lengthCode |
| 1485 | symbolLengths = {} |
| 1486 | total = 0 |
| 1487 | lastLength = 8 |
| 1488 | alphabetIter = iter(alphabet) |
| 1489 | while total<32768: |
| 1490 | #look ahead to see what is going to happen |
| 1491 | length = lengthCode.decodePeek( |
| 1492 | self.stream.peek(lengthCode.maxLength))[1].index |
| 1493 | #in every branch, set lengthCode.description to explanatory text |
| 1494 | #lengthCode calls format(symbol, extra) with this string |
| 1495 | if length==0: |
| 1496 | symbol = next(alphabetIter) |
| 1497 | lengthCode.description = 'symbol {} unused'.format(symbol) |
| 1498 | self.verboseRead(lengthCode) |
| 1499 | #unused symbol |
| 1500 | continue |
| 1501 | if length==16: |
| 1502 | lengthCode.description = \ |
| 1503 | '{1}+3 symbols of length '+str(lastLength) |
| 1504 | extra = self.verboseRead(lengthCode) |
| 1505 | #scan series of 16s (repeat counts) |
| 1506 | #start with repeat count 2 |
| 1507 | repeat = 2 |
| 1508 | startSymbol = next(alphabetIter) |
| 1509 | endSymbol = next(alphabetIter) |
| 1510 | symbolLengths[startSymbol.index] = \ |
| 1511 | symbolLengths[endSymbol.index] = lastLength |
| 1512 | #count the two just defined symbols |
| 1513 | total += 2*32768>>lastLength |
| 1514 | #note: loop may end because we're there |
| 1515 | #even if a 16 _appears_ to follow |
| 1516 | while True: |
| 1517 | #determine last symbol |
| 1518 | oldRepeat = repeat |
| 1519 | repeat = (repeat-2<<2)+extra+3 |
| 1520 | #read as many symbols as repeat increased |
| 1521 | for i in range(oldRepeat, repeat): |
| 1522 | endSymbol = next(alphabetIter) |
| 1523 | symbolLengths[endSymbol.index] = lastLength |
| 1524 | #compute new total; it may be end of loop |
| 1525 | total += (repeat-oldRepeat)*32768>>lastLength |
| 1526 | if total>=32768: break |
| 1527 | #see if there is more to do |
| 1528 | length = lengthCode.decodePeek( |
| 1529 | self.stream.peek(lengthCode.maxLength))[1].index |
| 1530 | if length!=16: break |
| 1531 | lengthCode.description = 'total {}+{{1}} symbols'.format( |
| 1532 | (repeat-2<<2)+3) |
| 1533 | extra = self.verboseRead(lengthCode) |
| 1534 | elif length==17: |
| 1535 | #read, and show explanation |
| 1536 | lengthCode.description = '{1}+3 unused' |
| 1537 | extra = self.verboseRead(lengthCode) |
| 1538 | #scan series of 17s (groups of zero counts) |
| 1539 | #start with repeat count 2 |
| 1540 | repeat = 2 |
| 1541 | startSymbol = next(alphabetIter) |
| 1542 | endSymbol = next(alphabetIter) |
| 1543 | #note: loop will not end with total==32768, |
| 1544 | #since total doesn't change here |
| 1545 | while True: |
| 1546 | #determine last symbol |
| 1547 | oldRepeat = repeat |
| 1548 | repeat = (repeat-2<<3)+extra+3 |
| 1549 | #read as many symbols as repeat increases |
| 1550 | for i in range(repeat-oldRepeat): |
| 1551 | endSymbol = next(alphabetIter) |
| 1552 | #see if there is more to do |
| 1553 | length = lengthCode.decodePeek( |
| 1554 | self.stream.peek(lengthCode.maxLength))[1].index |
| 1555 | if length!=17: break |
| 1556 | lengthCode.description = 'total {}+{{1}} unused'.format( |
| 1557 | (repeat-2<<3)+3) |
| 1558 | extra = self.verboseRead(lengthCode) |
| 1559 | else: |
| 1560 | symbol = next(alphabetIter) |
| 1561 | #double braces for format |
| 1562 | char = str(symbol) |
| 1563 | if char in '{}': char *= 2 |
| 1564 | lengthCode.description = \ |
| 1565 | 'Length for {} is {{0.index}} bits'.format(char) |
| 1566 | #output is not needed (will be 0) |
| 1567 | self.verboseRead(lengthCode) |
| 1568 | symbolLengths[symbol.index] = length |
| 1569 | total += 32768>>length |
| 1570 | lastLength = length |
| 1571 | assert total==32768 |
| 1572 | alphabet.setLength(symbolLengths) |
| 1573 | print('End of table. Prefix code '+alphabet.name+':') |
| 1574 | alphabet.showCode() |
| 1575 | |
| 1576 | #stream |
| 1577 | def processStream(self): |
| 1578 | """Process a brotli stream. |
| 1579 | """ |
| 1580 | print('addr hex{:{}s}binary context explanation'.format( |
| 1581 | '', self.width-10)) |
| 1582 | print('Stream header'.center(60, '-')) |
| 1583 | self.windowSize = self.verboseRead(WindowSizeAlphabet()) |
| 1584 | print('Metablock header'.center(60, '=')) |
| 1585 | self.ISLAST = False |
| 1586 | self.output = bytearray() |
| 1587 | while not self.ISLAST: |
| 1588 | self.ISLAST = self.verboseRead( |
| 1589 | BoolCode('LAST', description="Last block")) |
| 1590 | if self.ISLAST: |
| 1591 | if self.verboseRead( |
| 1592 | BoolCode('EMPTY', description="Empty block")): break |
| 1593 | if self.metablockLength(): continue |
| 1594 | if not self.ISLAST and self.uncompressed(): continue |
| 1595 | print('Block type descriptors'.center(60, '-')) |
| 1596 | self.numberOfBlockTypes = {} |
| 1597 | self.currentBlockCounts = {} |
| 1598 | self.blockTypeCodes = {} |
| 1599 | self.blockCountCodes = {} |
| 1600 | for blockType in (L,I,D): self.blockType(blockType) |
| 1601 | print('Distance code parameters'.center(60, '-')) |
| 1602 | self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet()) |
| 1603 | self.readLiteralContextModes() |
| 1604 | print('Context maps'.center(60, '-')) |
| 1605 | self.cmaps = {} |
| 1606 | #keep the number of each kind of prefix tree for the last loop |
| 1607 | numberOfTrees = {I: self.numberOfBlockTypes[I]} |
| 1608 | for blockType in (L,D): |
| 1609 | numberOfTrees[blockType] = self.contextMap(blockType) |
| 1610 | print('Prefix code lists'.center(60, '-')) |
| 1611 | self.prefixCodes = {} |
| 1612 | for blockType in (L,I,D): |
| 1613 | self.readPrefixArray(blockType, numberOfTrees[blockType]) |
| 1614 | self.metablock() |
| 1615 | |
| 1616 | #metablock header |
| 1617 | def verboseRead(self, alphabet, context='', skipExtra=False): |
| 1618 | """Read symbol and extra from stream and explain what happens. |
| 1619 | Returns the value of the symbol |
| 1620 | >>> olleke.pos = 0 |
| 1621 | >>> l = Layout(olleke) |
| 1622 | >>> l.verboseRead(WindowSizeAlphabet()) |
| 1623 | 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| 1624 | 4194288 |
| 1625 | """ |
| 1626 | #TODO 2: verbosity level, e.g. show only codes and maps in header |
| 1627 | stream = self.stream |
| 1628 | pos = stream.pos |
| 1629 | if skipExtra: |
| 1630 | length, symbol = alphabet.readTuple(stream) |
| 1631 | extraBits, extra = 0, None |
| 1632 | else: |
| 1633 | length, symbol, extraBits, extra = alphabet.readTupleAndExtra( |
| 1634 | stream) |
| 1635 | #fields: address, hex data, binary data, name of alphabet, explanation |
| 1636 | hexdata = self.makeHexData(pos) |
| 1637 | addressField = '{:04x}'.format(pos+7>>3) if hexdata else '' |
| 1638 | bitdata = self.formatBitData(pos, length, extraBits) |
| 1639 | #bitPtr moves bitdata so that the bytes are easier to read |
| 1640 | #jump back to right if a new byte starts |
| 1641 | if '|' in bitdata[1:]: |
| 1642 | #start over on the right side |
| 1643 | self.bitPtr = self.width |
| 1644 | fillWidth = self.bitPtr-(len(hexdata)+len(bitdata)) |
| 1645 | if fillWidth<0: fillWidth = 0 |
| 1646 | print('{:<5s} {:<{}s} {:7s} {}'.format( |
| 1647 | addressField, |
| 1648 | hexdata+' '*fillWidth+bitdata, self.width, |
| 1649 | context+alphabet.name, |
| 1650 | symbol if skipExtra else symbol.explanation(extra), |
| 1651 | )) |
| 1652 | #jump to the right if we started with a '|' |
| 1653 | #because we didn't jump before printing |
| 1654 | if bitdata.startswith('|'): self.bitPtr = self.width |
| 1655 | else: self.bitPtr -= len(bitdata) |
| 1656 | return symbol if skipExtra else symbol.value(extra) |
| 1657 | |
| 1658 | def metablockLength(self): |
| 1659 | """Read MNIBBLES and meta block length; |
| 1660 | if empty block, skip block and return true. |
| 1661 | """ |
| 1662 | self.MLEN = self.verboseRead(MetablockLengthAlphabet()) |
| 1663 | if self.MLEN: |
| 1664 | return False |
| 1665 | #empty block; skip and return False |
| 1666 | self.verboseRead(ReservedAlphabet()) |
| 1667 | MSKIP = self.verboseRead(SkipLengthAlphabet()) |
| 1668 | self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) |
| 1669 | self.stream.pos += 8*MSKIP |
| 1670 | print("Skipping to {:x}".format(self.stream.pos>>3)) |
| 1671 | return True |
| 1672 | |
| 1673 | def uncompressed(self): |
| 1674 | """If true, handle uncompressed data |
| 1675 | """ |
| 1676 | ISUNCOMPRESSED = self.verboseRead( |
| 1677 | BoolCode('UNCMPR', description='Is uncompressed?')) |
| 1678 | if ISUNCOMPRESSED: |
| 1679 | self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) |
| 1680 | print('Uncompressed data:') |
| 1681 | self.output += self.stream.readBytes(self.MLEN) |
| 1682 | print(outputFormatter(self.output[-self.MLEN:])) |
| 1683 | return ISUNCOMPRESSED |
| 1684 | |
| 1685 | def blockType(self, kind): |
| 1686 | """Read block type switch descriptor for given kind of blockType.""" |
| 1687 | NBLTYPES = self.verboseRead(TypeCountAlphabet( |
| 1688 | 'BT#'+kind[0].upper(), |
| 1689 | description='{} block types'.format(kind), |
| 1690 | )) |
| 1691 | self.numberOfBlockTypes[kind] = NBLTYPES |
| 1692 | if NBLTYPES>=2: |
| 1693 | self.blockTypeCodes[kind] = self.readPrefixCode( |
| 1694 | BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES)) |
| 1695 | self.blockCountCodes[kind] = self.readPrefixCode( |
| 1696 | BlockCountAlphabet('BC'+kind[0].upper())) |
| 1697 | blockCount = self.verboseRead(self.blockCountCodes[kind]) |
| 1698 | else: |
| 1699 | blockCount = 1<<24 |
| 1700 | self.currentBlockCounts[kind] = blockCount |
| 1701 | |
| 1702 | def readLiteralContextModes(self): |
| 1703 | """Read literal context modes. |
| 1704 | LSB6: lower 6 bits of last char |
| 1705 | MSB6: upper 6 bits of last char |
| 1706 | UTF8: rougly dependent on categories: |
| 1707 | upper 4 bits depend on category of last char: |
| 1708 | control/whitespace/space/ punctuation/quote/%/open/close/ |
| 1709 | comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant |
| 1710 | lower 2 bits depend on category of 2nd last char: |
| 1711 | space/punctuation/digit or upper/lowercase |
| 1712 | signed: hamming weight of last 2 chars |
| 1713 | """ |
| 1714 | print('Context modes'.center(60, '-')) |
| 1715 | self.literalContextModes = [] |
| 1716 | for i in range(self.numberOfBlockTypes[L]): |
| 1717 | self.literalContextModes.append( |
| 1718 | self.verboseRead(LiteralContextMode(number=i))) |
| 1719 | |
| 1720 | def contextMap(self, kind): |
| 1721 | """Read context maps |
| 1722 | Returns the number of differnt values on the context map |
| 1723 | (In other words, the number of prefix trees) |
| 1724 | """ |
| 1725 | NTREES = self.verboseRead(TypeCountAlphabet( |
| 1726 | kind[0].upper()+'T#', |
| 1727 | description='{} prefix trees'.format(kind))) |
| 1728 | mapSize = {L:64, D:4}[kind] |
| 1729 | if NTREES<2: |
| 1730 | self.cmaps[kind] = [0]*mapSize |
| 1731 | else: |
| 1732 | #read CMAPkind |
| 1733 | RLEMAX = self.verboseRead(RLEmaxAlphabet( |
| 1734 | 'RLE#'+kind[0].upper(), |
| 1735 | description=kind+' context map')) |
| 1736 | alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX) |
| 1737 | cmapCode = self.readPrefixCode(alphabet) |
| 1738 | tableSize = mapSize*self.numberOfBlockTypes[kind] |
| 1739 | cmap = [] |
| 1740 | while len(cmap)<tableSize: |
| 1741 | cmapCode.description = 'map {}, entry {}'.format( |
| 1742 | *divmod(len(cmap), mapSize)) |
| 1743 | count, value = self.verboseRead(cmapCode) |
| 1744 | cmap.extend([value]*count) |
| 1745 | assert len(cmap)==tableSize |
| 1746 | IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF')) |
| 1747 | if IMTF: |
| 1748 | self.IMTF(cmap) |
| 1749 | if kind==L: |
| 1750 | print('Context maps for literal data:') |
| 1751 | for i in range(0, len(cmap), 64): |
| 1752 | print(*( |
| 1753 | ''.join(map(str, cmap[j:j+8])) |
| 1754 | for j in range(i, i+64, 8) |
| 1755 | )) |
| 1756 | else: |
| 1757 | print('Context map for distances:') |
| 1758 | print(*( |
| 1759 | ''.join(map('{:x}'.format, cmap[i:i+4])) |
| 1760 | for i in range(0, len(cmap), 4) |
| 1761 | )) |
| 1762 | self.cmaps[kind] = cmap |
| 1763 | return NTREES |
| 1764 | |
| 1765 | @staticmethod |
| 1766 | def IMTF(v): |
| 1767 | """In place inverse move to front transform. |
| 1768 | """ |
| 1769 | #mtf is initialized virtually with range(infinity) |
| 1770 | mtf = [] |
| 1771 | for i, vi in enumerate(v): |
| 1772 | #get old value from mtf. If never seen, take virtual value |
| 1773 | try: value = mtf.pop(vi) |
| 1774 | except IndexError: value = vi |
| 1775 | #put value at front |
| 1776 | mtf.insert(0, value) |
| 1777 | #replace transformed value |
| 1778 | v[i] = value |
| 1779 | |
| 1780 | def readPrefixArray(self, kind, numberOfTrees): |
| 1781 | """Read prefix code array""" |
| 1782 | prefixes = [] |
| 1783 | for i in range(numberOfTrees): |
| 1784 | if kind==L: alphabet = LiteralAlphabet(i) |
| 1785 | elif kind==I: alphabet = InsertAndCopyAlphabet(i) |
| 1786 | elif kind==D: alphabet = DistanceAlphabet( |
| 1787 | i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT) |
| 1788 | self.readPrefixCode(alphabet) |
| 1789 | prefixes.append(alphabet) |
| 1790 | self.prefixCodes[kind] = prefixes |
| 1791 | |
| 1792 | #metablock data |
| 1793 | def metablock(self): |
| 1794 | """Process the data. |
| 1795 | Relevant variables of self: |
| 1796 | numberOfBlockTypes[kind]: number of block types |
| 1797 | currentBlockTypes[kind]: current block types (=0) |
| 1798 | literalContextModes: the context modes for the literal block types |
| 1799 | currentBlockCounts[kind]: counters for block types |
| 1800 | blockTypeCodes[kind]: code for block type |
| 1801 | blockCountCodes[kind]: code for block count |
| 1802 | cmaps[kind]: the context maps (not for I) |
| 1803 | prefixCodes[kind][#]: the prefix codes |
| 1804 | lastDistances: the last four distances |
| 1805 | lastChars: the last two chars |
| 1806 | output: the result |
| 1807 | """ |
| 1808 | print('Meta block contents'.center(60, '=')) |
| 1809 | self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1} |
| 1810 | self.lastDistances = deque([17,16,11,4], maxlen=4) |
| 1811 | #the current context mode is for block type 0 |
| 1812 | self.contextMode = ContextModeKeeper(self.literalContextModes[0]) |
| 1813 | wordList = WordList() |
| 1814 | |
| 1815 | #setup distance callback function |
| 1816 | def distanceCallback(symbol, extra): |
| 1817 | "callback function for displaying decoded distance" |
| 1818 | index, offset = symbol.value(extra) |
| 1819 | if index: |
| 1820 | #recent distance |
| 1821 | distance = self.lastDistances[-index]+offset |
| 1822 | return 'Distance: {}last{:+d}={}'.format(index, offset, distance) |
| 1823 | #absolute value |
| 1824 | if offset<=maxDistance: |
| 1825 | return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset) |
| 1826 | #word list value |
| 1827 | action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen]) |
| 1828 | return '{}-{} gives word {},{} action {}'.format( |
| 1829 | offset, maxDistance, copyLen, word, action) |
| 1830 | for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback |
| 1831 | |
| 1832 | blockLen = 0 |
| 1833 | #there we go |
| 1834 | while blockLen<self.MLEN: |
| 1835 | #get insert© command |
| 1836 | litLen, copyLen, dist0Flag = self.verboseRead( |
| 1837 | self.prefixCodes[I][ |
| 1838 | self.figureBlockType(I)]) |
| 1839 | #literal data |
| 1840 | for i in range(litLen): |
| 1841 | bt = self.figureBlockType(L) |
| 1842 | cm = self.contextMode.getIndex() |
| 1843 | ct = self.cmaps[L][bt<<6|cm] |
| 1844 | char = self.verboseRead( |
| 1845 | self.prefixCodes[L][ct], |
| 1846 | context='{},{}='.format(bt,cm)) |
| 1847 | self.contextMode.add(char) |
| 1848 | self.output.append(char) |
| 1849 | blockLen += litLen |
| 1850 | #check if we're done |
| 1851 | if blockLen>=self.MLEN: return |
| 1852 | #distance |
| 1853 | #distances are computed relative to output length, at most window size |
| 1854 | maxDistance = min(len(self.output), self.windowSize) |
| 1855 | if dist0Flag: |
| 1856 | distance = self.lastDistances[-1] |
| 1857 | else: |
| 1858 | bt = self.figureBlockType(D) |
| 1859 | cm = {2:0, 3:1, 4:2}.get(copyLen, 3) |
| 1860 | ct = self.cmaps[D][bt<<2|cm] |
| 1861 | index, offset = self.verboseRead( |
| 1862 | self.prefixCodes[D][ct], |
| 1863 | context='{},{}='.format(bt,cm)) |
| 1864 | distance = self.lastDistances[-index]+offset if index else offset |
| 1865 | if index==1 and offset==0: |
| 1866 | #to make sure distance is not put in last distance list |
| 1867 | dist0Flag = True |
| 1868 | if distance<=maxDistance: |
| 1869 | #copy from output |
| 1870 | for i in range( |
| 1871 | maxDistance-distance, |
| 1872 | maxDistance-distance+copyLen): |
| 1873 | self.output.append(self.output[i]) |
| 1874 | if not dist0Flag: self.lastDistances.append(distance) |
| 1875 | comment = 'Seen before' |
| 1876 | else: |
| 1877 | #fetch from wordlist |
| 1878 | newWord = wordList.word(copyLen, distance-maxDistance-1) |
| 1879 | self.output.extend(newWord) |
| 1880 | #adjust copyLen to reflect actual new data |
| 1881 | copyLen = len(newWord) |
| 1882 | comment = 'From wordlist' |
| 1883 | blockLen += copyLen |
| 1884 | print(' '*40, |
| 1885 | comment, |
| 1886 | ': "', |
| 1887 | outputFormatter(self.output[-copyLen:]), |
| 1888 | '"', |
| 1889 | sep='') |
| 1890 | self.contextMode.add(self.output[-2]) |
| 1891 | self.contextMode.add(self.output[-1]) |
| 1892 | |
| 1893 | def figureBlockType(self, kind): |
| 1894 | counts, types = self.currentBlockCounts, self.currentBlockTypes |
| 1895 | if counts[kind]==0: |
| 1896 | newType = self.verboseRead(self.blockTypeCodes[kind]) |
| 1897 | if newType==-2: newType = types['P'+kind] |
| 1898 | elif newType==-1: |
| 1899 | newType = (types[kind]+1)%self.numberOfBlockTypes[kind] |
| 1900 | types['P'+kind] = types[kind] |
| 1901 | types[kind] = newType |
| 1902 | counts[kind] = self.verboseRead(self.blockCountCodes[kind]) |
| 1903 | counts[kind] -=1 |
| 1904 | return types[kind] |
| 1905 | |
| 1906 | __test__ = { |
| 1907 | 'BitStream': """ |
| 1908 | >>> bs = BitStream(b'Jurjen') |
| 1909 | >>> bs.readBytes(2) |
| 1910 | b'Ju' |
| 1911 | >>> bs.read(6) #r=01110010 |
| 1912 | 50 |
| 1913 | >>> bs |
| 1914 | BitStream(pos=2:6) |
| 1915 | >>> bs.peek(5) #j=01101010 |
| 1916 | 9 |
| 1917 | >>> bs.readBytes(2) |
| 1918 | Traceback (most recent call last): |
| 1919 | ... |
| 1920 | ValueError: readBytes: need byte boundary |
| 1921 | """, |
| 1922 | |
| 1923 | 'Symbol': """ |
| 1924 | >>> a=Symbol(MetablockLengthAlphabet(),5) |
| 1925 | >>> len(a) |
| 1926 | 2 |
| 1927 | >>> int(a) |
| 1928 | 5 |
| 1929 | >>> a.bitPattern() |
| 1930 | '01' |
| 1931 | >>> a.value(200000) |
| 1932 | 200001 |
| 1933 | >>> a.explanation(300000) |
| 1934 | 'data length: 493e0h+1=300001' |
| 1935 | """, |
| 1936 | |
| 1937 | 'RangeDecoder': """ |
| 1938 | >>> a=RangeDecoder(bitLength=3) |
| 1939 | >>> len(a) |
| 1940 | 8 |
| 1941 | >>> a.name='t' |
| 1942 | >>> list(a) |
| 1943 | [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)] |
| 1944 | >>> a[2] |
| 1945 | Symbol(t, 2) |
| 1946 | >>> a.bitPattern(4) |
| 1947 | '100' |
| 1948 | >>> a.length(2) |
| 1949 | 3 |
| 1950 | >>> a.decodePeek(15) |
| 1951 | (3, Symbol(t, 7)) |
| 1952 | >>> |
| 1953 | |
| 1954 | """, |
| 1955 | |
| 1956 | 'PrefixDecoder': """ |
| 1957 | >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4}) |
| 1958 | >>> len(a) |
| 1959 | 4 |
| 1960 | >>> a.name='t' |
| 1961 | >>> list(a) |
| 1962 | [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)] |
| 1963 | >>> a.decodePeek(22) |
| 1964 | (1, Symbol(t, 1)) |
| 1965 | >>> a.decodePeek(27) |
| 1966 | (3, Symbol(t, 3)) |
| 1967 | >>> a.length(1) |
| 1968 | 1 |
| 1969 | >>> a.length(4) |
| 1970 | 3 |
| 1971 | """, |
| 1972 | |
| 1973 | 'Code': """ |
| 1974 | >>> a=Code('t',alphabetSize=10) |
| 1975 | >>> len(a) |
| 1976 | 10 |
| 1977 | >>> a.showCode() |
| 1978 | 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9 |
| 1979 | >>> a.setLength({2:1,3:2,5:3,6:3}) |
| 1980 | >>> a.showCode() |
| 1981 | 0:2 01:3 011:5 111:6 |
| 1982 | >>> len(a) |
| 1983 | 4 |
| 1984 | >>> def callback(i): return 'call{}back'.format(i) |
| 1985 | >>> a=Code('t',callback=callback,bitLength=3) |
| 1986 | >>> a[6].explanation() |
| 1987 | 'call6back' |
| 1988 | """, |
| 1989 | |
| 1990 | 'WithExtra': """ |
| 1991 | >>> class A(WithExtra): |
| 1992 | ... extraTable = [0,1,1,2,2] |
| 1993 | >>> a=A('t',alphabetSize=5) |
| 1994 | >>> a[1] |
| 1995 | Symbol(t, 1) |
| 1996 | >>> a.extraBits(2) |
| 1997 | 1 |
| 1998 | >>> a.mnemonic(4) |
| 1999 | '4' |
| 2000 | >>> a.readTupleAndExtra(BitStream(b'\x5b')) |
| 2001 | (3, Symbol(t, 3), 2, 3) |
| 2002 | """, |
| 2003 | |
| 2004 | 'BoolCode': """ |
| 2005 | >>> BoolCode('test')[0].explanation() |
| 2006 | '0: False' |
| 2007 | """, |
| 2008 | |
| 2009 | 'Enumerator': """ |
| 2010 | >>> class A(Enumerator): |
| 2011 | ... extraTable = [0,1,1,2,2] |
| 2012 | ... value0=3 |
| 2013 | >>> a=A(alphabetLength=5) |
| 2014 | >>> a.value(3) |
| 2015 | Traceback (most recent call last): |
| 2016 | ... |
| 2017 | TypeError: value() missing 1 required positional argument: 'extra' |
| 2018 | >>> a.explanation(3,4) |
| 2019 | 'xx 011: 8-11; 8+4=12' |
| 2020 | """, |
| 2021 | |
| 2022 | 'WindowSizeAlphabet': """ |
| 2023 | >>> windowSizeAlphabet = WindowSizeAlphabet() |
| 2024 | >>> windowSizeAlphabet[0] |
| 2025 | Traceback (most recent call last): |
| 2026 | ... |
| 2027 | ValueError: No symbol WindowSizeAlphabet[0] |
| 2028 | >>> len(windowSizeAlphabet) |
| 2029 | 16 |
| 2030 | >>> windowSizeAlphabet[21] |
| 2031 | Symbol(WSIZE, 21) |
| 2032 | >>> windowSizeAlphabet[21].bitPattern() |
| 2033 | '1001' |
| 2034 | >>> windowSizeAlphabet[21].extraBits() |
| 2035 | 0 |
| 2036 | >>> windowSizeAlphabet[21].index |
| 2037 | 21 |
| 2038 | >>> windowSizeAlphabet[10].value() |
| 2039 | 1008 |
| 2040 | >>> windowSizeAlphabet[10].explanation() |
| 2041 | 'windowsize=(1<<10)-16=1008' |
| 2042 | >>> windowSizeAlphabet.showCode() |
| 2043 | 0:65520 1100001:16368 1110001:32752 0011:262128 |
| 2044 | 0000001:131056 0010001:None 1001:2097136 1011:4194288 |
| 2045 | 1000001:4080 1010001:8176 0101:524272 0111:1048560 |
| 2046 | 0100001:1008 0110001:2032 1101:8388592 1111:16777200 |
| 2047 | """, |
| 2048 | |
| 2049 | 'TypeCountAlphabet': """ |
| 2050 | >>> typeCountAlphabet = TypeCountAlphabet(description='bananas') |
| 2051 | >>> len(typeCountAlphabet) |
| 2052 | 9 |
| 2053 | >>> typeCountAlphabet[3] |
| 2054 | Symbol(BT#, 3) |
| 2055 | >>> typeCountAlphabet[9] |
| 2056 | Traceback (most recent call last): |
| 2057 | ... |
| 2058 | ValueError: No symbol TypeCountAlphabet[9] |
| 2059 | >>> print(typeCountAlphabet[3]) |
| 2060 | xx,0101 |
| 2061 | >>> typeCountAlphabet[8].value(127) |
| 2062 | 256 |
| 2063 | >>> typeCountAlphabet[4].explanation(2) |
| 2064 | 'xxx,0111: 11 bananas' |
| 2065 | >>> typeCountAlphabet[0].explanation() |
| 2066 | '0: 1 banana' |
| 2067 | """, |
| 2068 | |
| 2069 | 'DistanceParamAlphabet': """ |
| 2070 | >>> dpa = DistanceParamAlphabet() |
| 2071 | >>> dpa.showCode() |
| 2072 | 00:PF0 01:PF1 10:PF2 11:PF3 |
| 2073 | >>> dpa.readTupleAndExtra(BitStream(b'\\x29')) |
| 2074 | (2, Symbol(DIST, 1), 4, 10) |
| 2075 | >>> dpa.explanation(2, 5) |
| 2076 | '2 postfix bits and 0101<<2=20 direct codes' |
| 2077 | """, |
| 2078 | |
| 2079 | 'LiteralAlphabet': """ |
| 2080 | >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS |
| 2081 | 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0 |
| 2082 | 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1 |
| 2083 | 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2 |
| 2084 | ... |
| 2085 | 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff |
| 2086 | 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc |
| 2087 | 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd |
| 2088 | 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce |
| 2089 | 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf |
| 2090 | """, |
| 2091 | |
| 2092 | 'BlockCountAlphabet': """ |
| 2093 | >>> bc=BlockCountAlphabet('BCL') |
| 2094 | >>> len(bc) |
| 2095 | 26 |
| 2096 | >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02') |
| 2097 | >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 2098 | 'Block count: xx 00000: 1-4; 1+2=3' |
| 2099 | >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 2100 | 'Block count: xxx 00110: 33-40; 33+0=33' |
| 2101 | >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 2102 | 'Block count: xxxxxx 10001: 305-368; 305+28=333' |
| 2103 | >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 2104 | 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333' |
| 2105 | """, |
| 2106 | |
| 2107 | 'Layout': """ |
| 2108 | >>> olleke.pos = 0 |
| 2109 | >>> l = Layout(olleke) |
| 2110 | >>> l.verboseRead(WindowSizeAlphabet()) |
| 2111 | 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| 2112 | 4194288 |
| 2113 | >>> l.verboseRead(BoolCode('LAST', description="Last block")) |
| 2114 | 1 LAST Last block: 1: True |
| 2115 | True |
| 2116 | >>> l.verboseRead(BoolCode('EMPTY', description="Empty block")) |
| 2117 | 0 EMPTY Empty block: 0: False |
| 2118 | False |
| 2119 | >>> l.verboseRead(MetablockLengthAlphabet()) |
| 2120 | 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 |
| 2121 | 47 |
| 2122 | >>> olleke.pos = 76 |
| 2123 | >>> l = Layout(olleke) |
| 2124 | >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True) |
| 2125 | 000a 82 10|1100 D0 10[15*x]-3 |
| 2126 | >>> x.explanation(0x86a3) |
| 2127 | '10[1000011010100011]-3: [0]+100000' |
| 2128 | """, |
| 2129 | |
| 2130 | 'olleke': """ |
| 2131 | >>> olleke.pos = 0 |
| 2132 | >>> try: Layout(olleke).processStream() |
| 2133 | ... except NotImplementedError: pass |
| 2134 | ... #doctest: +REPORT_NDIFF |
| 2135 | addr hex binary context explanation |
| 2136 | -----------------------Stream header------------------------ |
| 2137 | 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| 2138 | ======================Metablock header====================== |
| 2139 | 1 LAST Last block: 1: True |
| 2140 | 0 EMPTY Empty block: 0: False |
| 2141 | 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 |
| 2142 | -------------------Block type descriptors------------------- |
| 2143 | 0003 00 0 BT#L 0: 1 literal block type |
| 2144 | 0 BT#I 0: 1 insert© block type |
| 2145 | 0 BT#D 0: 1 distance block type |
| 2146 | ------------------Distance code parameters------------------ |
| 2147 | 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes |
| 2148 | -----------------------Context modes------------------------ |
| 2149 | 10 LC0 Context mode for type 0: 2(UTF8) |
| 2150 | ------------------------Context maps------------------------ |
| 2151 | 0 LT# 0: 1 literal prefix tree |
| 2152 | 0 DT# 0: 1 distance prefix tree |
| 2153 | ---------------------Prefix code lists---------------------- |
| 2154 | 10 PFX L0 is complex with lengths 3,4,0,5,17... |
| 2155 | 0005 4f 1|0 ##L0 len 3: coded with 3 bits |
| 2156 | 0111 ##L0 len 4: coded with 1 bits |
| 2157 | 10 ##L0 unused: coded with 3 bits |
| 2158 | 0006 d6 0|0 ##L0 len 5: skipped |
| 2159 | 011 ##L0 zero xxx: coded with 2 bits |
| 2160 | ***** Lengths for L0 will be coded as: |
| 2161 | 0:len 4 01:zero xxx 011:unused 111:len 3 |
| 2162 | 0007 95 1|11,01 #L0 7+3 unused |
| 2163 | 0 #L0 Length for \\n is 4 bits |
| 2164 | 001,01 #L0 1+3 unused |
| 2165 | 0008 44 010,0|1 #L0 total 19+2 unused |
| 2166 | 0 #L0 Length for " " is 4 bits |
| 2167 | 0 #L0 Length for ! is 4 bits |
| 2168 | 0009 cb 011,|01 #L0 3+3 unused |
| 2169 | |110,01 #L0 total 35+6 unused |
| 2170 | 000a 82 0 #L0 Length for K is 4 bits |
| 2171 | 000,01 #L0 0+3 unused |
| 2172 | 0 #L0 Length for O is 4 bits |
| 2173 | 000b 4d 01|1 #L0 symbol P unused |
| 2174 | 011 #L0 symbol Q unused |
| 2175 | 0 #L0 Length for R is 4 bits |
| 2176 | 000c 88 000,|01 #L0 0+3 unused |
| 2177 | |100,01 #L0 total 11+4 unused |
| 2178 | 000d b6 0 #L0 Length for b is 4 bits |
| 2179 | 011 #L0 symbol c unused |
| 2180 | 011 #L0 symbol d unused |
| 2181 | 000e 27 11|1 #L0 Length for e is 3 bits |
| 2182 | 010,01 #L0 2+3 unused |
| 2183 | |0 #L0 Length for k is 4 bits |
| 2184 | 000f 1f 111 #L0 Length for l is 3 bits |
| 2185 | 011 #L0 symbol m unused |
| 2186 | 0 #L0 Length for n is 4 bits |
| 2187 | |0 #L0 Length for o is 4 bits |
| 2188 | 0010 c1 000,01 #L0 0+3 unused |
| 2189 | 0 #L0 Length for s is 4 bits |
| 2190 | 0011 b4 0|11 #L0 symbol t unused |
| 2191 | 0 #L0 Length for u is 4 bits |
| 2192 | End of table. Prefix code L0: |
| 2193 | 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s |
| 2194 | 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u |
| 2195 | 11,01 PFX IC0 is simple with 4 code words |
| 2196 | 0012 2a |2Ah|10 IC0 ? bits: I5C4 |
| 2197 | 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7 |
| 2198 | 0015 22 0010|111011 IC0 ? bits: I8+xC5 |
| 2199 | 0016 8c 001100|0010 IC0 ? bits: I0C14+xx |
| 2200 | 0 SHAPE False: lengths 2,2,2,2 |
| 2201 | 0017 74 10,0|1 PFX D0 is simple with 3 code words |
| 2202 | 0018 a6 0|01110 D0 1 bit: 2last-3 |
| 2203 | 010011 D0 2 bits: 11xx-3 |
| 2204 | 0019 aa 01010|1 D0 2 bits: 11xxx-3 |
| 2205 | ====================Meta block contents===================== |
| 2206 | |1,01 IC0 Literal: 9, copy: 5 |
| 2207 | 001a 41 0001 0,0=L0 O |
| 2208 | 100 0,48=L0 l |
| 2209 | 001b a2 10|0 0,62=L0 l |
| 2210 | 000 0,63=L0 e |
| 2211 | 001c a1 1|101 0,59=L0 k |
| 2212 | 000 0,63=L0 e |
| 2213 | |1010 0,59=L0 " " |
| 2214 | 001d b5 0101 0,11=L0 b |
| 2215 | |1011 0,60=L0 o |
| 2216 | 001e 24 0 0,3=D0 Distance: 2last-3=8 |
| 2217 | Seen before: "lleke" |
| 2218 | 0,10 IC0 Literal: 6, copy: 7 |
| 2219 | |0010 0,59=L0 \\n |
| 2220 | 001f 89 1001 0,7=L0 R |
| 2221 | 000 0,52=L0 e |
| 2222 | 0020 fa 010|1 0,58=L0 b |
| 2223 | 1111 0,63=L0 u |
| 2224 | 0021 eb 011|1 0,59=L0 s |
| 2225 | 11,01 0,3=D0 Absolute value: 12 (pos 8) |
| 2226 | Seen before: "olleke\\n" |
| 2227 | 0022 db 01,1|1 IC0 Literal: 0, copy: 15 |
| 2228 | |110,11 0,3=D0 Absolute value: 27 (pos 0) |
| 2229 | Seen before: "Olleke bolleke\\n" |
| 2230 | 0023 f8 00 IC0 Literal: 5, copy: 4 |
| 2231 | 1110 0,7=L0 K |
| 2232 | 0024 2c 00|11 0,52=L0 n |
| 2233 | 1011 0,62=L0 o |
| 2234 | 0025 0d 1|00 0,59=L0 l |
| 2235 | 0110 0,63=L0 ! |
| 2236 | """, |
| 2237 | |
| 2238 | 'file': """ |
| 2239 | >>> try: Layout(BitStream( |
| 2240 | ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb') |
| 2241 | ... .read())).processStream() |
| 2242 | ... except NotImplementedError: pass |
| 2243 | addr hex binary context explanation |
| 2244 | -----------------------Stream header------------------------ |
| 2245 | 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| 2246 | ======================Metablock header====================== |
| 2247 | 1 LAST Last block: 1: True |
| 2248 | 0 EMPTY Empty block: 0: False |
| 2249 | 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 |
| 2250 | -------------------Block type descriptors------------------- |
| 2251 | 0003 00 0 BT#L 0: 1 literal block type |
| 2252 | 0 BT#I 0: 1 insert© block type |
| 2253 | 0 BT#D 0: 1 distance block type |
| 2254 | ------------------Distance code parameters------------------ |
| 2255 | 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes |
| 2256 | -----------------------Context modes------------------------ |
| 2257 | 10 LC0 Context mode for type 0: 2(UTF8) |
| 2258 | ------------------------Context maps------------------------ |
| 2259 | 0 LT# 0: 1 literal prefix tree |
| 2260 | 0 DT# 0: 1 distance prefix tree |
| 2261 | ---------------------Prefix code lists---------------------- |
| 2262 | 0005 b0 0|1,01 PFX L0 is simple with 2 code words |
| 2263 | 0006 b2 0|1011000 L0 1 bit: X |
| 2264 | 0007 ea 0|1011001 L0 1 bit: Y |
| 2265 | 01,01 PFX IC0 is simple with 2 code words |
| 2266 | 0008 81 0000001|111 IC0 1 bit: I1C9&D=0 |
| 2267 | 0009 47 02 0|47h|1 IC0 1 bit: I1C9 |
| 2268 | 00,01 PFX D0 is simple with 1 code word |
| 2269 | 000b 8a 010|000 D0 0 bits: 10x-3 |
| 2270 | ====================Meta block contents===================== |
| 2271 | 1 IC0 Literal: 1, copy: 9 |
| 2272 | 0 0,0=L0 X |
| 2273 | 0,() 0,3=D0 Absolute value: 1 (pos 0) |
| 2274 | Seen before: "XXXXXXXXX" |
| 2275 | 0 IC0 Literal: 1, copy: 9, same distance |
| 2276 | |1 0,54=L0 Y |
| 2277 | Seen before: "YYYYYYYYY" |
| 2278 | """, |
| 2279 | |
| 2280 | 'XY': """ |
| 2281 | >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream() |
| 2282 | ... except NotImplementedError: pass |
| 2283 | addr hex binary context explanation |
| 2284 | -----------------------Stream header------------------------ |
| 2285 | 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| 2286 | ======================Metablock header====================== |
| 2287 | 1 LAST Last block: 1: True |
| 2288 | 0 EMPTY Empty block: 0: False |
| 2289 | 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 |
| 2290 | -------------------Block type descriptors------------------- |
| 2291 | 0003 00 0 BT#L 0: 1 literal block type |
| 2292 | 0 BT#I 0: 1 insert© block type |
| 2293 | 0 BT#D 0: 1 distance block type |
| 2294 | ------------------Distance code parameters------------------ |
| 2295 | 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes |
| 2296 | -----------------------Context modes------------------------ |
| 2297 | 10 LC0 Context mode for type 0: 2(UTF8) |
| 2298 | ------------------------Context maps------------------------ |
| 2299 | 0 LT# 0: 1 literal prefix tree |
| 2300 | 0 DT# 0: 1 distance prefix tree |
| 2301 | ---------------------Prefix code lists---------------------- |
| 2302 | 0005 b0 0|1,01 PFX L0 is simple with 2 code words |
| 2303 | 0006 b2 0|1011000 L0 1 bit: X |
| 2304 | 0007 82 0|1011001 L0 1 bit: Y |
| 2305 | 00,01 PFX IC0 is simple with 1 code word |
| 2306 | 0008 84 0000100|100 IC0 0 bits: I4C6&D=0 |
| 2307 | 0009 00 00,0|1 PFX D0 is simple with 1 code word |
| 2308 | 000a e0 0|00000 D0 0 bits: last |
| 2309 | ====================Meta block contents===================== |
| 2310 | () IC0 Literal: 4, copy: 6, same distance |
| 2311 | 0 0,0=L0 X |
| 2312 | 0 0,52=L0 X |
| 2313 | 0 0,54=L0 X |
| 2314 | 0 0,54=L0 X |
| 2315 | Seen before: "XXXXXX" |
| 2316 | () IC0 Literal: 4, copy: 6, same distance |
| 2317 | 1 0,54=L0 Y |
| 2318 | 1 0,54=L0 Y |
| 2319 | |1 0,54=L0 Y |
| 2320 | 000b 01 1 0,54=L0 Y |
| 2321 | Seen before: "YYYYYY" |
| 2322 | """, |
| 2323 | |
| 2324 | 'empty': """ |
| 2325 | >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream() |
| 2326 | ... except NotImplementedError: pass |
| 2327 | addr hex binary context explanation |
| 2328 | -----------------------Stream header------------------------ |
| 2329 | 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056 |
| 2330 | ======================Metablock header====================== |
| 2331 | |1 LAST Last block: 1: True |
| 2332 | 0001 16 0 EMPTY Empty block: 0: False |
| 2333 | 11 MLEN 11: empty block |
| 2334 | 0 RSVD Reserved (must be zero) |
| 2335 | 0002 00 000000|00,01 SKIP skip length: 0h+1=1 |
| 2336 | |00 SKIP 2 bits ignored |
| 2337 | Skipping to 4 |
| 2338 | """, |
| 2339 | |
| 2340 | } |
| 2341 | |
| 2342 | if __name__=='__main__': |
| 2343 | import sys |
| 2344 | if len(sys.argv)>1: |
| 2345 | l = Layout(BitStream(open(sys.argv[1],'rb').read())) |
| 2346 | l.processStream() |
| 2347 | else: |
| 2348 | sys.path.append("h:/Persoonlijk/bin") |
| 2349 | try: |
| 2350 | import brotli |
| 2351 | open('brotlidump.br', 'wb').write( |
| 2352 | brotli.compress( |
| 2353 | open('brotlidump.py', 'r').read() |
| 2354 | )) |
| 2355 | olleke = BitStream(brotli.compress( |
| 2356 | 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!')) |
| 2357 | except ImportError: pass |
| 2358 | import doctest |
| 2359 | doctest.testmod(optionflags=doctest.REPORT_NDIFF |
| 2360 | #|doctest.FAIL_FAST |
| 2361 | ) |