Updates to Brotli compression format, decoder and encoder

This commit contains a batch of changes that were made to the Brotli
compression algorithm in the last month. Most important changes:

   * Updated spec
   * Changed Huffman code length alphabet to use run length codes more
     efficiently, based on a suggestion by Robert Obryk
   * Changed encoding of the number of Huffman code lengths (HLEN)
   * Changed encoding of the number of Huffman trees (NTREES)
   * Added support for uncompressed meta-blocks
diff --git a/brotli/enc/encode.cc b/brotli/enc/encode.cc
index 4223ca4..7d54dbe 100644
--- a/brotli/enc/encode.cc
+++ b/brotli/enc/encode.cc
@@ -64,21 +64,32 @@
   return retval;
 }
 
-void EncodeSize(size_t len, int* storage_ix, uint8_t* storage) {
-  std::vector<uint8_t> len_bytes;
-  do {
-    len_bytes.push_back(len & 0xff);
-    len >>= 8;
-  } while (len > 0);
-  WriteBits(3, len_bytes.size(), storage_ix, storage);
-  for (int i = 0; i < len_bytes.size(); ++i) {
-    WriteBits(8, len_bytes[i], storage_ix, storage);
+void EncodeVarLenUint8(int n, int* storage_ix, uint8_t* storage) {
+  if (n == 0) {
+    WriteBits(1, 0, storage_ix, storage);
+  } else {
+    WriteBits(1, 1, storage_ix, storage);
+    int nbits = Log2Floor(n);
+    WriteBits(3, nbits, storage_ix, storage);
+    if (nbits > 0) {
+      WriteBits(nbits, n - (1 << nbits), storage_ix, storage);
+    }
   }
 }
 
 void EncodeMetaBlockLength(size_t meta_block_size,
+                           bool is_last,
+                           bool is_uncompressed,
                            int* storage_ix, uint8_t* storage) {
-  WriteBits(1, 0, storage_ix, storage);
+  WriteBits(1, is_last, storage_ix, storage);
+  if (is_last) {
+    if (meta_block_size == 0) {
+      WriteBits(1, 1, storage_ix, storage);
+      return;
+    }
+    WriteBits(1, 0, storage_ix, storage);
+  }
+  --meta_block_size;
   int num_bits = Log2Floor(meta_block_size) + 1;
   if (num_bits < 16) {
     num_bits = 16;
@@ -89,6 +100,9 @@
     meta_block_size >>= 4;
     num_bits -= 4;
   }
+  if (!is_last) {
+    WriteBits(1, is_uncompressed, storage_ix, storage);
+  }
 }
 
 template<int kSize>
@@ -104,16 +118,16 @@
     const uint8_t* code_length_bitdepth,
     int* storage_ix, uint8_t* storage) {
   static const uint8_t kStorageOrder[kCodeLengthCodes] = {
-    1, 2, 3, 4, 0, 17, 18, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    1, 2, 3, 4, 0, 17, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
   };
   // Throw away trailing zeros:
   int codes_to_store = kCodeLengthCodes;
-  for (; codes_to_store > 4; --codes_to_store) {
+  for (; codes_to_store > 3; --codes_to_store) {
     if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
       break;
     }
   }
-  WriteBits(4, codes_to_store - 4, storage_ix, storage);
+  WriteBits(4, codes_to_store - 3, storage_ix, storage);
   const int skip_two_first =
       code_length_bitdepth[kStorageOrder[0]] == 0 &&
       code_length_bitdepth[kStorageOrder[1]] == 0;
@@ -144,9 +158,6 @@
       case 17:
         WriteBits(3, extra_bits, storage_ix, storage);
         break;
-      case 18:
-        WriteBits(7, extra_bits, storage_ix, storage);
-        break;
     }
   }
 }
@@ -225,16 +236,16 @@
   }
   int trimmed_size = trimmed_histogram.total_count_;
   bool write_length = false;
-  if (trimmed_size > 1 && trimmed_size < huffman_tree_size) {
+  if (trimmed_size >= 4 && trimmed_size <= 195 &&
+      trimmed_size < huffman_tree_size) {
     EntropyCode<kCodeLengthCodes> trimmed_entropy;
     BuildEntropyCode(trimmed_histogram, 5, kCodeLengthCodes, &trimmed_entropy);
     int huffman_bit_cost = HuffmanTreeBitCost(huffman_tree_histogram,
                                               huffman_tree_entropy);
     int trimmed_bit_cost = HuffmanTreeBitCost(trimmed_histogram,
                                               trimmed_entropy);;
-    const int nbits = Log2Ceiling(trimmed_size - 1);
-    const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2;
-    if (trimmed_bit_cost + 3 + 2 * nbitpairs < huffman_bit_cost) {
+    trimmed_bit_cost += (trimmed_size < 68 ? 7 : 8);
+    if (trimmed_bit_cost < huffman_bit_cost) {
       write_length = true;
       huffman_tree_size = trimmed_size;
       huffman_tree_entropy = trimmed_entropy;
@@ -245,10 +256,12 @@
       &huffman_tree_entropy.depth_[0], storage_ix, storage);
   WriteBits(1, write_length, storage_ix, storage);
   if (write_length) {
-    const int nbits = Log2Ceiling(huffman_tree_size - 1);
-    const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2;
-    WriteBits(3, nbitpairs - 1, storage_ix, storage);
-    WriteBits(nbitpairs * 2, huffman_tree_size - 2, storage_ix, storage);
+    WriteBits(1, huffman_tree_size >= 68, storage_ix, storage);
+    if (huffman_tree_size < 68) {
+      WriteBits(6, huffman_tree_size - 4, storage_ix, storage);
+    } else {
+      WriteBits(7, huffman_tree_size - 68, storage_ix, storage);
+    }
   }
   StoreHuffmanTreeToBitMask(&huffman_tree[0], &huffman_tree_extra_bits[0],
                             huffman_tree_size, huffman_tree_entropy,
@@ -464,7 +477,7 @@
 void EncodeContextMap(const std::vector<int>& context_map,
                       int num_clusters,
                       int* storage_ix, uint8_t* storage) {
-  WriteBits(8, num_clusters - 1, storage_ix, storage);
+  EncodeVarLenUint8(num_clusters - 1, storage_ix, storage);
 
   if (num_clusters == 1) {
     return;
@@ -476,11 +489,11 @@
   int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols);
   RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
                      &rle_symbols, &extra_bits);
-  HistogramLiteral symbol_histogram;
+  HistogramContextMap symbol_histogram;
   for (int i = 0; i < rle_symbols.size(); ++i) {
     symbol_histogram.Add(rle_symbols[i]);
   }
-  EntropyCodeLiteral symbol_code;
+  EntropyCodeContextMap symbol_code;
   BuildEntropyCode(symbol_histogram, 15, num_clusters + max_run_length_prefix,
                    &symbol_code);
   bool use_rle = max_run_length_prefix > 0;
@@ -510,7 +523,7 @@
 }
 
 struct BlockSplitCode {
-  EntropyCodeLiteral block_type_code;
+  EntropyCodeBlockType block_type_code;
   EntropyCodeBlockLength block_len_code;
 };
 
@@ -553,18 +566,12 @@
 void BuildAndEncodeBlockSplitCode(const BlockSplit& split,
                                   BlockSplitCode* code,
                                   int* storage_ix, uint8_t* storage) {
-  if (split.num_types_ <= 1) {
-    WriteBits(1, 0, storage_ix, storage);
+  EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage);
+  if (split.num_types_ == 1) {
     return;
   }
-  WriteBits(1, 1, storage_ix, storage);
-  int nbits = Log2Floor(split.num_types_ - 1);
-  WriteBits(3, nbits, storage_ix, storage);
-  if (nbits > 0) {
-    WriteBits(nbits, split.num_types_ - 1 - (1 << nbits), storage_ix, storage);
-  }
 
-  HistogramLiteral type_histo;
+  HistogramBlockType type_histo;
   for (int i = 0; i < split.type_codes_.size(); ++i) {
     type_histo.Add(split.type_codes_[i]);
   }
@@ -591,7 +598,7 @@
     ++it->idx_;
     it->type_ = it->split_.types_[it->idx_];
     it->length_ = it->split_.lengths_[it->idx_];
-    uint8_t type_code = it->split_.type_codes_[it->idx_];
+    int type_code = it->split_.type_codes_[it->idx_];
     EntropyEncode(type_code, code.block_type_code, storage_ix, storage);
     EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage);
   }
@@ -626,6 +633,9 @@
                     MetaBlock* mb) {
   mb->cmds = cmds;
   mb->params = params;
+  if (cmds.empty()) {
+    return;
+  }
   ComputeCommandPrefixes(&mb->cmds,
                          mb->params.num_direct_distance_codes,
                          mb->params.distance_postfix_bits);
@@ -661,9 +671,8 @@
                   &mb->command_histograms,
                   &distance_histograms);
 
-  // Histogram ids need to fit in one byte and there are 16 ids reserved for
-  // run length codes, which leaves a maximum number of 240 histograms.
-  static const int kMaxNumberOfHistograms = 240;
+  // Histogram ids need to fit in one byte.
+  static const int kMaxNumberOfHistograms = 256;
 
   mb->literal_histograms = literal_histograms;
   ClusterHistograms(literal_histograms,
@@ -692,14 +701,20 @@
 }
 
 void StoreMetaBlock(const MetaBlock& mb,
+                    const bool is_last,
                     const uint8_t* ringbuffer,
                     const size_t mask,
                     size_t* pos,
                     int* storage_ix, uint8_t* storage) {
   size_t length = MetaBlockLength(mb.cmds);
   const size_t end_pos = *pos + length;
-  EncodeMetaBlockLength(length - 1,
+  EncodeMetaBlockLength(length,
+                        is_last,
+                        false,
                         storage_ix, storage);
+  if (length == 0) {
+    return;
+  }
   BlockSplitCode literal_split_code;
   BlockSplitCode command_split_code;
   BlockSplitCode distance_split_code;
@@ -798,42 +813,65 @@
 
 void BrotliCompressor::WriteMetaBlock(const size_t input_size,
                                       const uint8_t* input_buffer,
+                                      const bool is_last,
                                       size_t* encoded_size,
                                       uint8_t* encoded_buffer) {
-  ringbuffer_.Write(input_buffer, input_size);
-  EstimateBitCostsForLiterals(input_pos_, input_size,
-                              kRingBufferMask, ringbuffer_.start(),
-                              &literal_cost_[0]);
   std::vector<Command> commands;
-  CreateBackwardReferences(input_size, input_pos_,
-                           ringbuffer_.start(),
-                           &literal_cost_[0],
-                           kRingBufferMask, kMaxBackwardDistance,
-                           hasher_,
-                           &commands);
-  ComputeDistanceShortCodes(&commands, dist_ringbuffer_,
-                            &dist_ringbuffer_idx_);
+  if (input_size > 0) {
+    ringbuffer_.Write(input_buffer, input_size);
+    EstimateBitCostsForLiterals(input_pos_, input_size,
+                                kRingBufferMask, ringbuffer_.start(),
+                                &literal_cost_[0]);
+    CreateBackwardReferences(input_size, input_pos_,
+                             ringbuffer_.start(),
+                             &literal_cost_[0],
+                             kRingBufferMask, kMaxBackwardDistance,
+                             hasher_,
+                             &commands);
+    ComputeDistanceShortCodes(&commands, dist_ringbuffer_,
+                              &dist_ringbuffer_idx_);
+  }
   EncodingParams params;
   params.num_direct_distance_codes = 12;
   params.distance_postfix_bits = 1;
   params.literal_context_mode = CONTEXT_SIGNED;
+  const int storage_ix0 = storage_ix_;
   MetaBlock mb;
   BuildMetaBlock(params, commands, ringbuffer_.start(), input_pos_,
                  kRingBufferMask, &mb);
-  StoreMetaBlock(mb, ringbuffer_.start(), kRingBufferMask,
+  StoreMetaBlock(mb, is_last, ringbuffer_.start(), kRingBufferMask,
                  &input_pos_, &storage_ix_, storage_);
-  size_t output_size = storage_ix_ >> 3;
-  memcpy(encoded_buffer, storage_, output_size);
-  *encoded_size = output_size;
-  storage_ix_ -= output_size << 3;
-  storage_[storage_ix_ >> 3] = storage_[output_size];
+  size_t output_size = is_last ? ((storage_ix_ + 7) >> 3) : (storage_ix_ >> 3);
+  if (input_size + 4 < output_size) {
+    storage_ix_ = storage_ix0;
+    storage_[storage_ix_ >> 3] &= (1 << (storage_ix_ & 7)) - 1;
+    EncodeMetaBlockLength(input_size, false, true, &storage_ix_, storage_);
+    size_t hdr_size = (storage_ix_ + 7) >> 3;
+    memcpy(encoded_buffer, storage_, hdr_size);
+    memcpy(encoded_buffer + hdr_size, input_buffer, input_size);
+    *encoded_size = hdr_size + input_size;
+    if (is_last) {
+      encoded_buffer[*encoded_size] = 0x3;  // ISLAST, ISEMPTY
+      ++(*encoded_size);
+    }
+    storage_ix_ = 0;
+    storage_[0] = 0;
+  } else {
+    memcpy(encoded_buffer, storage_, output_size);
+    *encoded_size = output_size;
+    if (is_last) {
+      storage_ix_ = 0;
+      storage_[0] = 0;
+    } else {
+      storage_ix_ -= output_size << 3;
+      storage_[storage_ix_ >> 3] = storage_[output_size];
+    }
+  }
 }
 
 void BrotliCompressor::FinishStream(
     size_t* encoded_size, uint8_t* encoded_buffer) {
-  WriteBits(2, 0x3, &storage_ix_, storage_);
-  *encoded_size = (storage_ix_ + 7) >> 3;
-  memcpy(encoded_buffer, storage_, *encoded_size);
+  WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer);
 }
 
 
@@ -857,21 +895,19 @@
 
   while (input_buffer < input_end) {
     int block_size = max_block_size;
+    bool is_last = false;
     if (block_size >= input_end - input_buffer) {
       block_size = input_end - input_buffer;
+      is_last = true;
     }
     size_t output_size = max_output_size;
-    compressor.WriteMetaBlock(block_size, input_buffer,
+    compressor.WriteMetaBlock(block_size, input_buffer, is_last,
                               &output_size, &encoded_buffer[*encoded_size]);
     input_buffer += block_size;
     *encoded_size += output_size;
     max_output_size -= output_size;
   }
 
-  size_t output_size = max_output_size;
-  compressor.FinishStream(&output_size, &encoded_buffer[*encoded_size]);
-  *encoded_size += output_size;
-
   return 1;
 }