The Independent JPEG Group's JPEG software v4a
diff --git a/jdhuff.c b/jdhuff.c
index cdc9bd7..3ac46cf 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -1,7 +1,7 @@
 /*
  * jdhuff.c
  *
- * Copyright (C) 1991, 1992, Thomas G. Lane.
+ * Copyright (C) 1991, 1992, 1993, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -26,6 +26,7 @@
 /* Compute derived values for a Huffman table */
 {
   int p, i, l, si;
+  int lookbits, ctr;
   char huffsize[257];
   UINT16 huffcode[257];
   UINT16 code;
@@ -55,23 +56,43 @@
     si++;
   }
 
-  /* We don't bother to fill in the encoding tables ehufco[] and ehufsi[], */
-  /* since they are not used for decoding. */
-
-  /* Figure F.15: generate decoding tables */
+  /* Figure F.15: generate decoding tables for bit-sequential decoding */
 
   p = 0;
   for (l = 1; l <= 16; l++) {
     if (htbl->bits[l]) {
-      htbl->valptr[l] = p;	/* huffval[] index of 1st sym of code len l */
-      htbl->mincode[l] = huffcode[p]; /* minimum code of length l */
+      htbl->priv.dec.valptr[l] = p; /* huffval[] index of 1st symbol of code length l */
+      htbl->priv.dec.mincode[l] = huffcode[p]; /* minimum code of length l */
       p += htbl->bits[l];
-      htbl->maxcode[l] = huffcode[p-1];	/* maximum code of length l */
+      htbl->priv.dec.maxcode[l] = huffcode[p-1]; /* maximum code of length l */
     } else {
-      htbl->maxcode[l] = -1;
+      htbl->priv.dec.maxcode[l] = -1; /* -1 if no codes of this length */
     }
   }
-  htbl->maxcode[17] = 0xFFFFFL;	/* ensures huff_DECODE terminates */
+  htbl->priv.dec.maxcode[17] = 0xFFFFFL; /* ensures huff_DECODE terminates */
+
+  /* Compute lookahead tables to speed up decoding.
+   * First we set all the table entries to 0, indicating "too long";
+   * then we iterate through the Huffman codes that are short enough and
+   * fill in all the entries that correspond to bit sequences starting
+   * with that code.
+   */
+
+  MEMZERO(htbl->priv.dec.look_nbits, SIZEOF(htbl->priv.dec.look_nbits));
+
+  p = 0;
+  for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
+    for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
+      /* l = current code's length, p = its index in huffcode[] & huffval[]. */
+      /* Generate left-justified code followed by all possible bit sequences */
+      lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
+      for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
+	htbl->priv.dec.look_nbits[lookbits] = l;
+	htbl->priv.dec.look_sym[lookbits] = htbl->huffval[p];
+	lookbits++;
+      }
+    }
+  }
 }
 
 
@@ -82,11 +103,13 @@
  *
  * We read source bytes into get_buffer and dole out bits as needed.
  * If get_buffer already contains enough bits, they are fetched in-line
- * by the macros get_bits() and get_bit().  When there aren't enough bits,
- * fill_bit_buffer is called; it will attempt to fill get_buffer to the
- * "high water mark", then extract the desired number of bits.  The idea,
- * of course, is to minimize the function-call overhead cost of entering
- * fill_bit_buffer.
+ * by the macros check_bit_buffer and get_bits.  When there aren't enough
+ * bits, fill_bit_buffer is called; it will attempt to fill get_buffer to
+ * the "high water mark" (not just to the number of bits needed; this reduces
+ * the function-call overhead cost of entering fill_bit_buffer).
+ * On return, fill_bit_buffer guarantees that get_buffer contains at least
+ * the requested number of bits --- dummy zeroes are inserted if necessary.
+ *
  * On most machines MIN_GET_BITS should be 25 to allow the full 32-bit width
  * of get_buffer to be used.  (On machines with wider words, an even larger
  * buffer could be used.)  However, on some machines 32-bit shifts are
@@ -102,16 +125,13 @@
 #define MIN_GET_BITS  25	/* max value for 32-bit get_buffer */
 #endif
 
-static const int bmask[16] =	/* bmask[n] is mask for n rightmost bits */
-  { 0, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF,
-    0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF };
 
-
-LOCAL int
+LOCAL void
 fill_bit_buffer (int nbits)
-/* Load up the bit buffer and do get_bits(nbits) */
+/* Load up the bit buffer to a depth of at least nbits */
 {
   /* Attempt to load at least MIN_GET_BITS bits into get_buffer. */
+  /* (It is assumed that no request will be for more than that many bits.) */
   while (bits_left < MIN_GET_BITS) {
     register int c = JGETC(dcinfo);
     
@@ -128,10 +148,11 @@
 	if (bits_left >= nbits)
 	  break;
 	/* Uh-oh.  Report corrupted data to user and stuff zeroes into
-	 * the data stream, so we can produce some kind of image.
+	 * the data stream, so that we can produce some kind of image.
 	 * Note that this will be repeated for each byte demanded for the
 	 * rest of the segment; this is a bit slow but not unreasonably so.
-	 * The main thing is to avoid getting a zillion warnings, hence:
+	 * The main thing is to avoid getting a zillion warnings, hence
+	 * we use a flag to ensure that only one warning appears.
 	 */
 	if (! printed_eod) {
 	  WARNMS(dcinfo->emethods, "Corrupt JPEG data: premature end of data segment");
@@ -145,40 +166,81 @@
     get_buffer = (get_buffer << 8) | c;
     bits_left += 8;
   }
-
-  /* Having filled get_buffer, extract desired bits (this simplifies macros) */
-  bits_left -= nbits;
-  return ((int) (get_buffer >> bits_left)) & bmask[nbits];
 }
 
 
-/* Macros to make things go at some speed! */
-/* NB: parameter to get_bits should be simple variable, not expression */
+/*
+ * These macros provide the in-line portion of bit fetching.
+ * Correct usage is:
+ *	check_bit_buffer(n);		ensure there are N bits in get_buffer
+ *      val = get_bits(n);		fetch N bits
+ * The value n should be a simple variable, not an expression, because it
+ * is evaluated multiple times.
+ * peek_bits() fetches next N bits without removing them from the buffer.
+ */
+
+#define check_bit_buffer(nbits) \
+	{ if (bits_left < (nbits))  fill_bit_buffer(nbits); }
 
 #define get_bits(nbits) \
-	(bits_left >= (nbits) ? \
-	 ((int) (get_buffer >> (bits_left -= (nbits)))) & bmask[nbits] : \
-	 fill_bit_buffer(nbits))
+	(((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
 
-#define get_bit() \
-	(bits_left ? \
-	 ((int) (get_buffer >> (--bits_left))) & 1 : \
-	 fill_bit_buffer(1))
+#define peek_bits(nbits) \
+	(((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
 
 
-/* Figure F.16: extract next coded symbol from input stream */
+/*
+ * Routines to extract next Huffman-coded symbol from input bit stream.
+ * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
+ * without looping.  Usually, more than 95% of the Huffman codes will be 8
+ * or fewer bits long.  The few overlength codes are handled with a loop.
+ * The primary case is made a macro for speed reasons; the secondary
+ * routine slow_DECODE is rarely entered and need not be inline code.
+ *
+ * Notes about the huff_DECODE macro:
+ * 1. The first if-test is coded to call fill_bit_buffer only when necessary.
+ * 2. If the lookahead succeeds, we need only decrement bits_left to remove
+ *    the proper number of bits from get_buffer.
+ * 3. If the lookahead table contains no entry, the next code must be
+ *    more than HUFF_LOOKAHEAD bits long.
+ * 4. Near the end of the data segment, we may fail to get enough bits
+ *    for a lookahead.  In that case, we do it the hard way.
+ */
+
+#define huff_DECODE(htbl,result) \
+{ register int nb, look;					\
+  if (bits_left >= HUFF_LOOKAHEAD ||				\
+      (fill_bit_buffer(0), bits_left >= HUFF_LOOKAHEAD)) {	\
+    look = peek_bits(HUFF_LOOKAHEAD);				\
+    if ((nb = htbl->priv.dec.look_nbits[look]) != 0) {		\
+      bits_left -= nb;						\
+      result = htbl->priv.dec.look_sym[look];			\
+    } else							\
+      result = slow_DECODE(htbl, HUFF_LOOKAHEAD+1);		\
+  } else							\
+    result = slow_DECODE(htbl, 1);				\
+}
+
   
-INLINE
 LOCAL int
-huff_DECODE (HUFF_TBL * htbl)
+slow_DECODE (HUFF_TBL * htbl, int min_bits)
 {
-  register int l;
+  register int l = min_bits;
   register INT32 code;
-  
-  code = get_bit();
-  l = 1;
-  while (code > htbl->maxcode[l]) {
-    code = (code << 1) | get_bit();
+
+  /* huff_DECODE has determined that the code is at least min_bits */
+  /* bits long, so fetch that many bits in one swoop. */
+
+  check_bit_buffer(l);
+  code = get_bits(l);
+
+  /* Collect the rest of the Huffman code one bit at a time. */
+  /* This is per Figure F.16 in the JPEG spec. */
+
+  while (code > htbl->priv.dec.maxcode[l]) {
+    code <<= 1;
+    check_bit_buffer(1);
+    code |= get_bits(1);
     l++;
   }
 
@@ -189,11 +251,20 @@
     return 0;			/* fake a zero as the safest result */
   }
 
-  return htbl->huffval[ htbl->valptr[l] + ((int) (code - htbl->mincode[l])) ];
+  return htbl->huffval[ htbl->priv.dec.valptr[l] +
+		        ((int) (code - htbl->priv.dec.mincode[l])) ];
 }
 
 
-/* Figure F.12: extend sign bit */
+/* Figure F.12: extend sign bit.
+ * On some machines, a shift and add will be faster than a table lookup.
+ */
+
+#ifdef AVOID_TABLES
+
+#define huff_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
+
+#else
 
 #define huff_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
 
@@ -207,6 +278,8 @@
     ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
     ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
 
+#endif /* AVOID_TABLES */
+
 
 /*
  * Initialize for a Huffman-compressed scan.
@@ -214,7 +287,7 @@
  */
 
 METHODDEF void
-huff_decoder_init (decompress_info_ptr cinfo)
+decoder_init (decompress_info_ptr cinfo)
 {
   short ci;
   jpeg_component_info * compptr;
@@ -294,7 +367,7 @@
 
 
 /* ZAG[i] is the natural-order position of the i'th element of zigzag order.
- * If the incoming data is corrupted, huff_decode_mcu could attempt to
+ * If the incoming data is corrupted, decode_mcu could attempt to
  * reference values beyond the end of the array.  To avoid a wild store,
  * we put some extra zeroes after the real entries.
  */
@@ -324,7 +397,7 @@
  */
 
 METHODDEF void
-huff_decode_mcu (decompress_info_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu (decompress_info_ptr cinfo, JBLOCKROW *MCU_data)
 {
   register int s, k, r;
   short blkn, ci;
@@ -354,8 +427,9 @@
     /* Decode a single block's worth of coefficients */
 
     /* Section F.2.2.1: decode the DC coefficient difference */
-    s = huff_DECODE(dctbl);
+    huff_DECODE(dctbl, s);
     if (s) {
+      check_bit_buffer(s);
       r = get_bits(s);
       s = huff_EXTEND(r, s);
     }
@@ -369,13 +443,14 @@
     /* Section F.2.2.2: decode the AC coefficients */
     /* Since zero values are skipped, output area must be zeroed beforehand */
     for (k = 1; k < DCTSIZE2; k++) {
-      r = huff_DECODE(actbl);
+      huff_DECODE(actbl, s);
       
-      s = r & 15;
-      r = r >> 4;
+      r = s >> 4;
+      s &= 15;
       
       if (s) {
 	k += r;
+	check_bit_buffer(s);
 	r = get_bits(s);
 	s = huff_EXTEND(r, s);
 	/* Descale coefficient and output in natural (dezigzagged) order */
@@ -395,7 +470,7 @@
  */
 
 METHODDEF void
-huff_decoder_term (decompress_info_ptr cinfo)
+decoder_term (decompress_info_ptr cinfo)
 {
   /* No work needed */
 }
@@ -409,8 +484,8 @@
 jseldhuffman (decompress_info_ptr cinfo)
 {
   if (! cinfo->arith_code) {
-    cinfo->methods->entropy_decode_init = huff_decoder_init;
-    cinfo->methods->entropy_decode = huff_decode_mcu;
-    cinfo->methods->entropy_decode_term = huff_decoder_term;
+    cinfo->methods->entropy_decode_init = decoder_init;
+    cinfo->methods->entropy_decode = decode_mcu;
+    cinfo->methods->entropy_decode_term = decoder_term;
   }
 }