Greatly improve performance of Huffman decoding


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@64 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/jdhuff.c b/jdhuff.c
index b5ba39f..18a0c7e 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -14,6 +14,21 @@
  * storage only upon successful completion of an MCU.
  */
 
+/* Modifications:
+ * Copyright (C)2007 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
@@ -234,7 +249,8 @@
    * with that code.
    */
 
-  MEMZERO(dtbl->look_nbits, SIZEOF(dtbl->look_nbits));
+   for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+     dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
 
   p = 0;
   for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
@@ -243,8 +259,7 @@
       /* Generate left-justified code followed by all possible bit sequences */
       lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
       for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
-	dtbl->look_nbits[lookbits] = l;
-	dtbl->look_sym[lookbits] = htbl->huffval[p];
+	dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
 	lookbits++;
       }
     }
@@ -438,9 +453,10 @@
  * On some machines, a shift and add will be faster than a table lookup.
  */
 
+#define AVOID_TABLES
 #ifdef AVOID_TABLES
 
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
+#define HUFF_EXTEND(x,s)  ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((-1)<<(s)) + 1)))
 
 #else
 
@@ -498,6 +514,236 @@
 }
 
 
+LOCAL(boolean)
+decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  BITREAD_STATE_VARS;
+  int blkn;
+  savable_state state;
+  /* Outer loop handles each block in the MCU */
+
+  /* Load up working state */
+  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+  ASSIGN_STATE(state, entropy->saved);
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    JBLOCKROW block = MCU_data[blkn];
+    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    register int s, k, r;
+
+    /* Decode a single block's worth of coefficients */
+
+    /* Section F.2.2.1: decode the DC coefficient difference */
+    HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
+    if (s) {
+      CHECK_BIT_BUFFER(br_state, s, return FALSE);
+      r = GET_BITS(s);
+      s = HUFF_EXTEND(r, s);
+    }
+
+    if (entropy->dc_needed[blkn]) {
+      /* Convert DC difference to actual value, update last_dc_val */
+      int ci = cinfo->MCU_membership[blkn];
+      s += state.last_dc_val[ci];
+      state.last_dc_val[ci] = s;
+      /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+      (*block)[0] = (JCOEF) s;
+    }
+
+    if (entropy->ac_needed[blkn]) {
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* Since zeroes are skipped, output area must be cleared beforehand */
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
+
+        r = s >> 4;
+        s &= 15;
+      
+        if (s) {
+          k += r;
+          CHECK_BIT_BUFFER(br_state, s, return FALSE);
+          r = GET_BITS(s);
+          s = HUFF_EXTEND(r, s);
+          /* Output coefficient in natural (dezigzagged) order.
+           * Note: the extra entries in jpeg_natural_order[] will save us
+           * if k >= DCTSIZE2, which could happen if the data is corrupted.
+           */
+          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+        } else {
+          if (r != 15)
+            break;
+          k += 15;
+        }
+      }
+
+    } else {
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* In this path we just discard the values */
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
+
+        r = s >> 4;
+        s &= 15;
+
+        if (s) {
+          k += r;
+          CHECK_BIT_BUFFER(br_state, s, return FALSE);
+          DROP_BITS(s);
+        } else {
+          if (r != 15)
+            break;
+          k += 15;
+        }
+      }
+    }
+  }
+
+  /* Completed MCU, so update state */
+  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  ASSIGN_STATE(entropy->saved, state);
+  return TRUE;
+}
+
+
+/***************************************************************/
+
+#define ADD_BYTE  {                                     \
+  int val0 = *(buffer++);                               \
+  int val1 = *(buffer);                                 \
+                                                        \
+  bits_left += 8;                                       \
+  get_buffer = (get_buffer << 8) | (val0);              \
+  if (val0 == 0xFF) {                                   \
+    buffer++;                                           \
+    if (val1 != 0) {                                    \
+      buffer   -= 2;                                    \
+      get_buffer      &= ~0xFF;                         \
+    }                                                   \
+  }                                                     \
+}
+
+/***************************************************************/
+
+#if __WORDSIZE == 64
+
+#define ENSURE_SHORT \
+  if (bits_left < 16) { \
+    ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE \
+  }
+
+#else
+
+#define ENSURE_SHORT  if (bits_left < 16) { ADD_BYTE ADD_BYTE }
+
+#endif
+
+/***************************************************************/
+
+#define HUFF_DECODE_FAST(symbol, size, htbl) { \
+  ENSURE_SHORT \
+  symbol = PEEK_BITS(HUFF_LOOKAHEAD); \
+  symbol = htbl->lookup[symbol]; \
+  size = symbol >> 8; \
+  bits_left -= size; \
+  symbol = symbol & ((1 << HUFF_LOOKAHEAD) - 1); \
+  if (size == HUFF_LOOKAHEAD + 1) { \
+    symbol = (get_buffer >> bits_left) & ((1 << (size)) - 1); \
+    while (symbol > htbl->maxcode[size]) { \
+      symbol <<= 1; \
+      symbol |= GET_BITS(1); \
+      size++; \
+    } \
+    symbol = htbl->pub->huffval[ (int) (symbol + htbl->valoffset[size]) ]; \
+  } \
+}
+
+/***************************************************************/
+
+LOCAL(boolean)
+decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  BITREAD_STATE_VARS;
+  JOCTET *buffer;
+  int blkn;
+  savable_state state;
+  /* Outer loop handles each block in the MCU */
+
+  /* Load up working state */
+  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+  buffer = (JOCTET *) br_state.next_input_byte;
+  ASSIGN_STATE(state, entropy->saved);
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    JBLOCKROW block = MCU_data[blkn];
+    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    register int s, k, r, l;
+
+    HUFF_DECODE_FAST(s, l, dctbl);
+    if (s) {
+      ENSURE_SHORT
+      r = GET_BITS(s);
+      s = HUFF_EXTEND(r, s);
+    }
+
+    if (entropy->dc_needed[blkn]) {
+      int ci = cinfo->MCU_membership[blkn];
+      s += state.last_dc_val[ci];
+      state.last_dc_val[ci] = s;
+      (*block)[0] = (JCOEF) s;
+    }
+
+    if (entropy->ac_needed[blkn]) {
+
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE_FAST(s, l, actbl);
+        r = s >> 4;
+        s &= 15;
+      
+        if (s) {
+          k += r;
+          ENSURE_SHORT
+          r = GET_BITS(s);
+          s = HUFF_EXTEND(r, s);
+          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+        } else {
+          if (r != 15) break;
+          k += 15;
+        }
+      }
+
+    } else {
+
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE_FAST(s, l, actbl);
+        r = s >> 4;
+        s &= 15;
+
+        if (s) {
+          k += r;
+          ENSURE_SHORT
+          DROP_BITS(s);
+        } else {
+          if (r != 15) break;
+          k += 15;
+        }
+      }
+    }
+  }
+
+  br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
+  br_state.next_input_byte = buffer;
+  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  ASSIGN_STATE(entropy->saved, state);
+  return TRUE;
+}
+
+
 /*
  * Decode and return one MCU's worth of Huffman-compressed coefficients.
  * The coefficients are reordered from zigzag order into natural array order,
@@ -513,13 +759,12 @@
  * this module, since we'll just re-assign them on the next call.)
  */
 
+#define BUFSIZE (DCTSIZE2 * 2)
+
 METHODDEF(boolean)
 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int blkn;
-  BITREAD_STATE_VARS;
-  savable_state state;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -533,91 +778,13 @@
    */
   if (! entropy->pub.insufficient_data) {
 
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
-
-    /* Outer loop handles each block in the MCU */
-
-    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      JBLOCKROW block = MCU_data[blkn];
-      d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-      d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
-      register int s, k, r;
-
-      /* Decode a single block's worth of coefficients */
-
-      /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
-      }
-
-      if (entropy->dc_needed[blkn]) {
-	/* Convert DC difference to actual value, update last_dc_val */
-	int ci = cinfo->MCU_membership[blkn];
-	s += state.last_dc_val[ci];
-	state.last_dc_val[ci] = s;
-	/* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
-	(*block)[0] = (JCOEF) s;
-      }
-
-      if (entropy->ac_needed[blkn]) {
-
-	/* Section F.2.2.2: decode the AC coefficients */
-	/* Since zeroes are skipped, output area must be cleared beforehand */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
-	  if (s) {
-	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    r = GET_BITS(s);
-	    s = HUFF_EXTEND(r, s);
-	    /* Output coefficient in natural (dezigzagged) order.
-	     * Note: the extra entries in jpeg_natural_order[] will save us
-	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
-	     */
-	    (*block)[jpeg_natural_order[k]] = (JCOEF) s;
-	  } else {
-	    if (r != 15)
-	      break;
-	    k += 15;
-	  }
-	}
-
-      } else {
-
-	/* Section F.2.2.2: decode the AC coefficients */
-	/* In this path we just discard the values */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
-	  if (s) {
-	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    DROP_BITS(s);
-	  } else {
-	    if (r != 15)
-	      break;
-	    k += 15;
-	  }
-	}
-
-      }
+    if (cinfo->src->bytes_in_buffer >= BUFSIZE) {
+      if (!decode_mcu_fast(cinfo, MCU_data)) return FALSE;
+    }
+    else {
+      if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
     }
 
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
   }
 
   /* Account for restart interval (no-op if not using restarts) */
diff --git a/jdhuff.h b/jdhuff.h
index ae19b6c..8e50bbf 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -36,13 +36,17 @@
   /* Link to public Huffman table (needed only in jpeg_huff_decode) */
   JHUFF_TBL *pub;
 
-  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+  /* Lookahead table: indexed by the next HUFF_LOOKAHEAD bits of
    * the input data stream.  If the next Huffman code is no more
    * than HUFF_LOOKAHEAD bits long, we can obtain its length and
-   * the corresponding symbol directly from these tables.
+   * the corresponding symbol directly from this tables.
+   *
+   * The lower 8 bits of each table entry contain the number of
+   * bits in the corresponding Huffman code, or HUFF_LOOKAHEAD + 1
+   * if too long.  The next 8 bits of each entry contain the
+   * symbol.
    */
-  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
-  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+  int lookup[1<<HUFF_LOOKAHEAD];
 } d_derived_tbl;
 
 /* Expand a Huffman table definition into the derived format */
@@ -69,8 +73,8 @@
  * necessary.
  */
 
-typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
-#define BIT_BUF_SIZE  32	/* size of buffer in bits */
+typedef long bit_buf_type;		/* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  __WORDSIZE	/* size of buffer in bits */
 
 /* If long is > 32 bits on your machine, and shifting/masking longs is
  * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
@@ -183,11 +187,10 @@
     } \
   } \
   look = PEEK_BITS(HUFF_LOOKAHEAD); \
-  if ((nb = htbl->look_nbits[look]) != 0) { \
+  if ((nb = (htbl->lookup[look] >> HUFF_LOOKAHEAD)) <= HUFF_LOOKAHEAD) { \
     DROP_BITS(nb); \
-    result = htbl->look_sym[look]; \
+    result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
   } else { \
-    nb = HUFF_LOOKAHEAD+1; \
 slowlabel: \
     if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
 	{ failaction; } \