The Independent JPEG Group's JPEG software v3
diff --git a/jfwddct.c b/jfwddct.c
index 21d8448..0ca0e78 100644
--- a/jfwddct.c
+++ b/jfwddct.c
@@ -1,7 +1,7 @@
/*
* jfwddct.c
*
- * Copyright (C) 1991, Thomas G. Lane.
+ * Copyright (C) 1991, 1992, Thomas G. Lane.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -16,25 +16,12 @@
#include "jinclude.h"
-
-/* We assume that right shift corresponds to signed division by 2 with
- * rounding towards minus infinity. This is correct for typical "arithmetic
- * shift" instructions that shift in copies of the sign bit. But some
- * C compilers implement >> with an unsigned shift. For these machines you
- * must define RIGHT_SHIFT_IS_UNSIGNED.
- * RIGHT_SHIFT provides a signed right shift of an INT32 quantity.
- * It is only applied with constant shift counts.
+/*
+ * This routine is specialized to the case DCTSIZE = 8.
*/
-#ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define SHIFT_TEMPS INT32 shift_temp;
-#define RIGHT_SHIFT(x,shft) \
- ((shift_temp = (x)) < 0 ? \
- (shift_temp >> (shft)) | ((~0) << (32-(shft))) : \
- (shift_temp >> (shft)))
-#else
-#define SHIFT_TEMPS
-#define RIGHT_SHIFT(x,shft) ((x) >> (shft))
+#if DCTSIZE != 8
+ Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif
@@ -139,74 +126,6 @@
/*
- * Perform a 1-dimensional DCT.
- * Note that this code is specialized to the case DCTSIZE = 8.
- */
-
-INLINE
-LOCAL void
-fast_dct_8 (DCTELEM *in, int stride)
-{
- /* many tmps have nonoverlapping lifetime -- flashy register colourers
- * should be able to do this lot very well
- */
- INT32 in0, in1, in2, in3, in4, in5, in6, in7;
- INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- INT32 tmp10, tmp11, tmp12, tmp13;
- INT32 tmp14, tmp15, tmp16, tmp17;
- INT32 tmp25, tmp26;
- SHIFT_TEMPS
-
- in0 = in[ 0];
- in1 = in[stride ];
- in2 = in[stride*2];
- in3 = in[stride*3];
- in4 = in[stride*4];
- in5 = in[stride*5];
- in6 = in[stride*6];
- in7 = in[stride*7];
-
- tmp0 = in7 + in0;
- tmp1 = in6 + in1;
- tmp2 = in5 + in2;
- tmp3 = in4 + in3;
- tmp4 = in3 - in4;
- tmp5 = in2 - in5;
- tmp6 = in1 - in6;
- tmp7 = in0 - in7;
-
- tmp10 = tmp3 + tmp0;
- tmp11 = tmp2 + tmp1;
- tmp12 = tmp1 - tmp2;
- tmp13 = tmp0 - tmp3;
-
- in[ 0] = (DCTELEM) UNFIXH((tmp10 + tmp11) * SIN_1_4);
- in[stride*4] = (DCTELEM) UNFIXH((tmp10 - tmp11) * COS_1_4);
-
- in[stride*2] = (DCTELEM) UNFIXH(tmp13*COS_1_8 + tmp12*SIN_1_8);
- in[stride*6] = (DCTELEM) UNFIXH(tmp13*SIN_1_8 - tmp12*COS_1_8);
-
- tmp16 = UNFIXO((tmp6 + tmp5) * SIN_1_4);
- tmp15 = UNFIXO((tmp6 - tmp5) * COS_1_4);
-
- OVERSHIFT(tmp4);
- OVERSHIFT(tmp7);
-
- /* tmp4, tmp7, tmp15, tmp16 are overscaled by OVERSCALE */
-
- tmp14 = tmp4 + tmp15;
- tmp25 = tmp4 - tmp15;
- tmp26 = tmp7 - tmp16;
- tmp17 = tmp7 + tmp16;
-
- in[stride ] = (DCTELEM) UNFIXH(tmp17*OCOS_1_16 + tmp14*OSIN_1_16);
- in[stride*7] = (DCTELEM) UNFIXH(tmp17*OCOS_7_16 - tmp14*OSIN_7_16);
- in[stride*5] = (DCTELEM) UNFIXH(tmp26*OCOS_5_16 + tmp25*OSIN_5_16);
- in[stride*3] = (DCTELEM) UNFIXH(tmp26*OCOS_3_16 - tmp25*OSIN_3_16);
-}
-
-
-/*
* Perform the forward DCT on one block of samples.
*
* A 2-D DCT can be done by 1-D DCT on each row
@@ -216,11 +135,74 @@
GLOBAL void
j_fwd_dct (DCTBLOCK data)
{
- int i;
-
- for (i = 0; i < DCTSIZE; i++)
- fast_dct_8(data+i*DCTSIZE, 1);
+ int pass, rowctr;
+ register DCTELEM *inptr, *outptr;
+ DCTBLOCK workspace;
- for (i = 0; i < DCTSIZE; i++)
- fast_dct_8(data+i, DCTSIZE);
+ /* Each iteration of the inner loop performs one 8-point 1-D DCT.
+ * It reads from a *row* of the input matrix and stores into a *column*
+ * of the output matrix. In the first pass, we read from the data[] array
+ * and store into the local workspace[]. In the second pass, we read from
+ * the workspace[] array and store into data[], thus performing the
+ * equivalent of a columnar DCT pass with no variable array indexing.
+ */
+
+ inptr = data; /* initialize pointers for first pass */
+ outptr = workspace;
+ for (pass = 1; pass >= 0; pass--) {
+ for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+ /* many tmps have nonoverlapping lifetime -- flashy register colourers
+ * should be able to do this lot very well
+ */
+ INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ INT32 tmp10, tmp11, tmp12, tmp13;
+ INT32 tmp14, tmp15, tmp16, tmp17;
+ INT32 tmp25, tmp26;
+ SHIFT_TEMPS
+
+ tmp0 = inptr[7] + inptr[0];
+ tmp1 = inptr[6] + inptr[1];
+ tmp2 = inptr[5] + inptr[2];
+ tmp3 = inptr[4] + inptr[3];
+ tmp4 = inptr[3] - inptr[4];
+ tmp5 = inptr[2] - inptr[5];
+ tmp6 = inptr[1] - inptr[6];
+ tmp7 = inptr[0] - inptr[7];
+
+ tmp10 = tmp3 + tmp0;
+ tmp11 = tmp2 + tmp1;
+ tmp12 = tmp1 - tmp2;
+ tmp13 = tmp0 - tmp3;
+
+ outptr[ 0] = (DCTELEM) UNFIXH((tmp10 + tmp11) * SIN_1_4);
+ outptr[DCTSIZE*4] = (DCTELEM) UNFIXH((tmp10 - tmp11) * COS_1_4);
+
+ outptr[DCTSIZE*2] = (DCTELEM) UNFIXH(tmp13*COS_1_8 + tmp12*SIN_1_8);
+ outptr[DCTSIZE*6] = (DCTELEM) UNFIXH(tmp13*SIN_1_8 - tmp12*COS_1_8);
+
+ tmp16 = UNFIXO((tmp6 + tmp5) * SIN_1_4);
+ tmp15 = UNFIXO((tmp6 - tmp5) * COS_1_4);
+
+ OVERSHIFT(tmp4);
+ OVERSHIFT(tmp7);
+
+ /* tmp4, tmp7, tmp15, tmp16 are overscaled by OVERSCALE */
+
+ tmp14 = tmp4 + tmp15;
+ tmp25 = tmp4 - tmp15;
+ tmp26 = tmp7 - tmp16;
+ tmp17 = tmp7 + tmp16;
+
+ outptr[DCTSIZE ] = (DCTELEM) UNFIXH(tmp17*OCOS_1_16 + tmp14*OSIN_1_16);
+ outptr[DCTSIZE*7] = (DCTELEM) UNFIXH(tmp17*OCOS_7_16 - tmp14*OSIN_7_16);
+ outptr[DCTSIZE*5] = (DCTELEM) UNFIXH(tmp26*OCOS_5_16 + tmp25*OSIN_5_16);
+ outptr[DCTSIZE*3] = (DCTELEM) UNFIXH(tmp26*OCOS_3_16 - tmp25*OSIN_3_16);
+
+ inptr += DCTSIZE; /* advance inptr to next row */
+ outptr++; /* advance outptr to next column */
+ }
+ /* end of pass; in case it was pass 1, set up for pass 2 */
+ inptr = workspace;
+ outptr = data;
+ }
}