Blame - jcdctmgr.c - platform/external/libjpeg-turbo

blob: 156957ab6677311d13e250503a3875724261dfe8 [file] [log] [blame]

Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	1	/*
				2	* jcdctmgr.c
				3	*
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	4	* Copyright (C) 1994-1996, Thomas G. Lane.
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	5	* Copyright (C) 1999-2006, MIYASAKA Masaru.
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	6	* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	7	* This file is part of the Independent JPEG Group's software.
				8	* For conditions of distribution and use, see the accompanying README file.
				9	*
				10	* This file contains the forward-DCT management logic.
				11	* This code selects a particular DCT implementation to be used,
				12	* and it performs related housekeeping chores including coefficient
				13	* quantization.
				14	*/
				15
				16	#define JPEG_INTERNALS
				17	#include "jinclude.h"
				18	#include "jpeglib.h"
				19	#include "jdct.h" /* Private declarations for DCT subsystem */
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	20	#include "jsimddct.h"
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	21
				22
				23	/* Private subobject for this module */
				24
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	25	typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
				26	typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
				27
				28	typedef JMETHOD(void, convsamp_method_ptr,
				29	(JSAMPARRAY sample_data, JDIMENSION start_col,
				30	DCTELEM * workspace));
				31	typedef JMETHOD(void, float_convsamp_method_ptr,
				32	(JSAMPARRAY sample_data, JDIMENSION start_col,
				33	FAST_FLOAT *workspace));
				34
				35	typedef JMETHOD(void, quantize_method_ptr,
				36	(JCOEFPTR coef_block, DCTELEM * divisors,
				37	DCTELEM * workspace));
				38	typedef JMETHOD(void, float_quantize_method_ptr,
				39	(JCOEFPTR coef_block, FAST_FLOAT * divisors,
				40	FAST_FLOAT * workspace));
				41
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	42	typedef struct {
				43	struct jpeg_forward_dct pub; /* public fields */
				44
				45	/* Pointer to the DCT routine actually in use */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	46	forward_DCT_method_ptr dct;
				47	convsamp_method_ptr convsamp;
				48	quantize_method_ptr quantize;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	49
				50	/* The actual post-DCT divisors --- not identical to the quant table
				51	* entries, because of scaling (especially for an unnormalized DCT).
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	52	* Each table is given in normal array order.
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	53	*/
				54	DCTELEM * divisors[NUM_QUANT_TBLS];
				55
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	56	/* work area for FDCT subroutine */
				57	DCTELEM * workspace;
				58
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	59	#ifdef DCT_FLOAT_SUPPORTED
				60	/* Same as above for the floating-point case. */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	61	float_DCT_method_ptr float_dct;
				62	float_convsamp_method_ptr float_convsamp;
				63	float_quantize_method_ptr float_quantize;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	64	FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	65	FAST_FLOAT * float_workspace;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	66	#endif
				67	} my_fdct_controller;
				68
				69	typedef my_fdct_controller * my_fdct_ptr;
				70
				71
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	72	/*
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	73	* Find the highest bit in an integer through binary search.
				74	*/
				75	LOCAL(int)
DRC	fc5dc4f	2009-10-01 22:26:14 +0000	[diff] [blame]	76	flss (UINT16 val)
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	77	{
				78	int bit;
				79
				80	bit = 16;
				81
				82	if (!val)
				83	return 0;
				84
				85	if (!(val & 0xff00)) {
				86	bit -= 8;
				87	val <<= 8;
				88	}
				89	if (!(val & 0xf000)) {
				90	bit -= 4;
				91	val <<= 4;
				92	}
				93	if (!(val & 0xc000)) {
				94	bit -= 2;
				95	val <<= 2;
				96	}
				97	if (!(val & 0x8000)) {
				98	bit -= 1;
				99	val <<= 1;
				100	}
				101
				102	return bit;
				103	}
				104
				105	/*
				106	* Compute values to do a division using reciprocal.
				107	*
				108	* This implementation is based on an algorithm described in
				109	* "How to optimize for the Pentium family of microprocessors"
				110	* (http://www.agner.org/assem/).
				111	* More information about the basic algorithm can be found in
				112	* the paper "Integer Division Using Reciprocals" by Robert Alverson.
				113	*
				114	* The basic idea is to replace x/d by x * d^-1. In order to store
				115	* d^-1 with enough precision we shift it left a few places. It turns
				116	* out that this algorithm gives just enough precision, and also fits
				117	* into DCTELEM:
				118	*
				119	* b = (the number of significant bits in divisor) - 1
				120	* r = (word size) + b
				121	* f = 2^r / divisor
				122	*
				123	* f will not be an integer for most cases, so we need to compensate
				124	* for the rounding error introduced:
				125	*
				126	* no fractional part:
				127	*
				128	* result = input >> r
				129	*
				130	* fractional part of f < 0.5:
				131	*
				132	* round f down to nearest integer
				133	* result = ((input + 1) * f) >> r
				134	*
				135	* fractional part of f > 0.5:
				136	*
				137	* round f up to nearest integer
				138	* result = (input * f) >> r
				139	*
				140	* This is the original algorithm that gives truncated results. But we
				141	* want properly rounded results, so we replace "input" with
				142	* "input + divisor/2".
				143	*
				144	* In order to allow SIMD implementations we also tweak the values to
				145	* allow the same calculation to be made at all times:
				146	*
				147	* dctbl[0] = f rounded to nearest integer
				148	* dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
				149	* dctbl[2] = 1 << ((word size) * 2 - r)
				150	* dctbl[3] = r - (word size)
				151	*
				152	* dctbl[2] is for stupid instruction sets where the shift operation
				153	* isn't member wise (e.g. MMX).
				154	*
				155	* The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
				156	* is that most SIMD implementations have a "multiply and store top
				157	* half" operation.
				158	*
				159	* Lastly, we store each of the values in their own table instead
				160	* of in a consecutive manner, yet again in order to allow SIMD
				161	* routines.
				162	*/
				163	LOCAL(void)
				164	compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
				165	{
				166	UDCTELEM2 fq, fr;
				167	UDCTELEM c;
				168	int b, r;
				169
DRC	fc5dc4f	2009-10-01 22:26:14 +0000	[diff] [blame]	170	b = flss(divisor) - 1;
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	171	r = sizeof(DCTELEM) * 8 + b;
				172
				173	fq = ((UDCTELEM2)1 << r) / divisor;
				174	fr = ((UDCTELEM2)1 << r) % divisor;
				175
				176	c = divisor / 2; /* for rounding */
				177
				178	if (fr == 0) { /* divisor is power of two */
				179	/* fq will be one bit too large to fit in DCTELEM, so adjust */
				180	fq >>= 1;
				181	r--;
				182	} else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
				183	c++;
				184	} else { /* fractional part is > 0.5 */
				185	fq++;
				186	}
				187
				188	dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
				189	dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
				190	dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)82 - r)); /* scale */
				191	dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)8; / shift */
				192	}
				193
				194	/*
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	195	* Initialize for a processing pass.
				196	* Verify that all referenced Q-tables are present, and set up
				197	* the divisor table for each one.
				198	* In the current implementation, DCT of all components is done during
				199	* the first pass, even if only some components will be output in the
				200	* first scan. Hence all components should be examined here.
				201	*/
				202
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	203	METHODDEF(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	204	start_pass_fdctmgr (j_compress_ptr cinfo)
				205	{
				206	my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
				207	int ci, qtblno, i;
				208	jpeg_component_info *compptr;
				209	JQUANT_TBL * qtbl;
				210	DCTELEM * dtbl;
				211
				212	for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
				213	ci++, compptr++) {
				214	qtblno = compptr->quant_tbl_no;
				215	/* Make sure specified quantization table is present */
				216	if (qtblno < 0 \|\| qtblno >= NUM_QUANT_TBLS \|\|
				217	cinfo->quant_tbl_ptrs[qtblno] == NULL)
				218	ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
				219	qtbl = cinfo->quant_tbl_ptrs[qtblno];
				220	/* Compute divisors for this quant table */
				221	/* We may do this more than once for same table, but it's not a big deal */
				222	switch (cinfo->dct_method) {
				223	#ifdef DCT_ISLOW_SUPPORTED
				224	case JDCT_ISLOW:
				225	/* For LL&M IDCT method, divisors are equal to raw quantization
				226	* coefficients multiplied by 8 (to counteract scaling).
				227	*/
				228	if (fdct->divisors[qtblno] == NULL) {
				229	fdct->divisors[qtblno] = (DCTELEM *)
				230	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	231	(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	232	}
				233	dtbl = fdct->divisors[qtblno];
				234	for (i = 0; i < DCTSIZE2; i++) {
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	235	compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	236	}
				237	break;
				238	#endif
				239	#ifdef DCT_IFAST_SUPPORTED
				240	case JDCT_IFAST:
				241	{
				242	/* For AA&N IDCT method, divisors are equal to quantization
				243	* coefficients scaled by scalefactor[row]*scalefactor[col], where
				244	* scalefactor[0] = 1
				245	* scalefactor[k] = cos(kPI/16) sqrt(2) for k=1..7
				246	* We apply a further scale factor of 8.
				247	*/
				248	#define CONST_BITS 14
				249	static const INT16 aanscales[DCTSIZE2] = {
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	250	/* precomputed values scaled up by 14 bits */
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	251	16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
				252	22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
				253	21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
				254	19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
				255	16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
				256	12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
				257	8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
				258	4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
				259	};
				260	SHIFT_TEMPS
				261
				262	if (fdct->divisors[qtblno] == NULL) {
				263	fdct->divisors[qtblno] = (DCTELEM *)
				264	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	265	(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	266	}
				267	dtbl = fdct->divisors[qtblno];
				268	for (i = 0; i < DCTSIZE2; i++) {
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	269	compute_reciprocal(
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	270	DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
Thomas G. Lane	bc79e06	1995-08-02 00:00:00 +0000	[diff] [blame]	271	(INT32) aanscales[i]),
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	272	CONST_BITS-3), &dtbl[i]);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	273	}
				274	}
				275	break;
				276	#endif
				277	#ifdef DCT_FLOAT_SUPPORTED
				278	case JDCT_FLOAT:
				279	{
				280	/* For float AA&N IDCT method, divisors are equal to quantization
				281	* coefficients scaled by scalefactor[row]*scalefactor[col], where
				282	* scalefactor[0] = 1
				283	* scalefactor[k] = cos(kPI/16) sqrt(2) for k=1..7
				284	* We apply a further scale factor of 8.
				285	* What's actually stored is 1/divisor so that the inner loop can
				286	* use a multiplication rather than a division.
				287	*/
				288	FAST_FLOAT * fdtbl;
				289	int row, col;
				290	static const double aanscalefactor[DCTSIZE] = {
				291	1.0, 1.387039845, 1.306562965, 1.175875602,
				292	1.0, 0.785694958, 0.541196100, 0.275899379
				293	};
				294
				295	if (fdct->float_divisors[qtblno] == NULL) {
				296	fdct->float_divisors[qtblno] = (FAST_FLOAT *)
				297	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				298	DCTSIZE2 * SIZEOF(FAST_FLOAT));
				299	}
				300	fdtbl = fdct->float_divisors[qtblno];
Thomas G. Lane	bc79e06	1995-08-02 00:00:00 +0000	[diff] [blame]	301	i = 0;
				302	for (row = 0; row < DCTSIZE; row++) {
				303	for (col = 0; col < DCTSIZE; col++) {
				304	fdtbl[i] = (FAST_FLOAT)
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	305	(1.0 / (((double) qtbl->quantval[i] *
Thomas G. Lane	bc79e06	1995-08-02 00:00:00 +0000	[diff] [blame]	306	aanscalefactor[row] * aanscalefactor[col] * 8.0)));
				307	i++;
				308	}
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	309	}
				310	}
				311	break;
				312	#endif
				313	default:
				314	ERREXIT(cinfo, JERR_NOT_COMPILED);
				315	break;
				316	}
				317	}
				318	}
				319
				320
				321	/*
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	322	* Load data into workspace, applying unsigned->signed conversion.
				323	*/
				324
				325	METHODDEF(void)
				326	convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
				327	{
				328	register DCTELEM *workspaceptr;
				329	register JSAMPROW elemptr;
				330	register int elemr;
				331
				332	workspaceptr = workspace;
				333	for (elemr = 0; elemr < DCTSIZE; elemr++) {
				334	elemptr = sample_data[elemr] + start_col;
				335
				336	#if DCTSIZE == 8 /* unroll the inner loop */
				337	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				338	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				339	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				340	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				341	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				342	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				343	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				344	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				345	#else
				346	{
				347	register int elemc;
				348	for (elemc = DCTSIZE; elemc > 0; elemc--)
				349	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				350	}
				351	#endif
				352	}
				353	}
				354
				355
				356	/*
				357	* Quantize/descale the coefficients, and store into coef_blocks[].
				358	*/
				359
				360	METHODDEF(void)
				361	quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
				362	{
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	363	int i;
				364	DCTELEM temp;
				365	UDCTELEM recip, corr, shift;
				366	UDCTELEM2 product;
				367	JCOEFPTR output_ptr = coef_block;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	368
				369	for (i = 0; i < DCTSIZE2; i++) {
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	370	temp = workspace[i];
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	371	recip = divisors[i + DCTSIZE2 * 0];
				372	corr = divisors[i + DCTSIZE2 * 1];
				373	shift = divisors[i + DCTSIZE2 * 3];
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	374
				375	if (temp < 0) {
				376	temp = -temp;
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	377	product = (UDCTELEM2)(temp + corr) * recip;
				378	product >>= shift + sizeof(DCTELEM)*8;
				379	temp = product;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	380	temp = -temp;
				381	} else {
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	382	product = (UDCTELEM2)(temp + corr) * recip;
				383	product >>= shift + sizeof(DCTELEM)*8;
				384	temp = product;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	385	}
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	386
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	387	output_ptr[i] = (JCOEF) temp;
				388	}
				389	}
				390
				391
				392	/*
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	393	* Perform forward DCT on one or more blocks of a component.
				394	*
				395	* The input samples are taken from the sample_data[] array starting at
				396	* position start_row/start_col, and moving to the right for any additional
Thomas G. Lane	bc79e06	1995-08-02 00:00:00 +0000	[diff] [blame]	397	* blocks. The quantized coefficients are returned in coef_blocks[].
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	398	*/
				399
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	400	METHODDEF(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	401	forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
				402	JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
				403	JDIMENSION start_row, JDIMENSION start_col,
				404	JDIMENSION num_blocks)
				405	/* This version is used for integer DCT implementations. */
				406	{
				407	/* This routine is heavily used, so it's worth coding it tightly. */
				408	my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	409	DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	410	DCTELEM * workspace;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	411	JDIMENSION bi;
				412
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	413	/* Make sure the compiler doesn't look up these every pass */
				414	forward_DCT_method_ptr do_dct = fdct->dct;
				415	convsamp_method_ptr do_convsamp = fdct->convsamp;
				416	quantize_method_ptr do_quantize = fdct->quantize;
Pierre Ossman	dc5db14	2009-03-13 12:17:26 +0000	[diff] [blame]	417	workspace = fdct->workspace;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	418
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	419	sample_data += start_row; /* fold in the vertical offset once */
				420
				421	for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
				422	/* Load data into workspace, applying unsigned->signed conversion */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	423	(*do_convsamp) (sample_data, start_col, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	424
				425	/* Perform the DCT */
				426	(*do_dct) (workspace);
				427
				428	/* Quantize/descale the coefficients, and store into coef_blocks[] */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	429	(*do_quantize) (coef_blocks[bi], divisors, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	430	}
				431	}
				432
				433
				434	#ifdef DCT_FLOAT_SUPPORTED
				435
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	436
				437	METHODDEF(void)
				438	convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
				439	{
				440	register FAST_FLOAT *workspaceptr;
				441	register JSAMPROW elemptr;
				442	register int elemr;
				443
				444	workspaceptr = workspace;
				445	for (elemr = 0; elemr < DCTSIZE; elemr++) {
				446	elemptr = sample_data[elemr] + start_col;
				447	#if DCTSIZE == 8 /* unroll the inner loop */
				448	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				449	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				450	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				451	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				452	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				453	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				454	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				455	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				456	#else
				457	{
				458	register int elemc;
				459	for (elemc = DCTSIZE; elemc > 0; elemc--)
				460	*workspaceptr++ = (FAST_FLOAT)
				461	(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
				462	}
				463	#endif
				464	}
				465	}
				466
				467
				468	METHODDEF(void)
				469	quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
				470	{
				471	register FAST_FLOAT temp;
				472	register int i;
				473	register JCOEFPTR output_ptr = coef_block;
				474
				475	for (i = 0; i < DCTSIZE2; i++) {
				476	/* Apply the quantization and scaling factor */
				477	temp = workspace[i] * divisors[i];
				478
				479	/* Round to nearest integer.
				480	* Since C does not specify the direction of rounding for negative
				481	* quotients, we have to force the dividend positive for portability.
				482	* The maximum coefficient size is +-16K (for 12-bit data), so this
				483	* code should work for either 16-bit or 32-bit ints.
				484	*/
				485	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
				486	}
				487	}
				488
				489
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	490	METHODDEF(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	491	forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
				492	JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
				493	JDIMENSION start_row, JDIMENSION start_col,
				494	JDIMENSION num_blocks)
				495	/* This version is used for floating-point DCT implementations. */
				496	{
				497	/* This routine is heavily used, so it's worth coding it tightly. */
				498	my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	499	FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	500	FAST_FLOAT * workspace;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	501	JDIMENSION bi;
				502
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	503
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	504	/* Make sure the compiler doesn't look up these every pass */
				505	float_DCT_method_ptr do_dct = fdct->float_dct;
				506	float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
				507	float_quantize_method_ptr do_quantize = fdct->float_quantize;
Pierre Ossman	dc5db14	2009-03-13 12:17:26 +0000	[diff] [blame]	508	workspace = fdct->float_workspace;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	509
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	510	sample_data += start_row; /* fold in the vertical offset once */
				511
				512	for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
				513	/* Load data into workspace, applying unsigned->signed conversion */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	514	(*do_convsamp) (sample_data, start_col, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	515
				516	/* Perform the DCT */
				517	(*do_dct) (workspace);
				518
				519	/* Quantize/descale the coefficients, and store into coef_blocks[] */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	520	(*do_quantize) (coef_blocks[bi], divisors, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	521	}
				522	}
				523
				524	#endif /* DCT_FLOAT_SUPPORTED */
				525
				526
				527	/*
				528	* Initialize FDCT manager.
				529	*/
				530
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	531	GLOBAL(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	532	jinit_forward_dct (j_compress_ptr cinfo)
				533	{
				534	my_fdct_ptr fdct;
				535	int i;
				536
				537	fdct = (my_fdct_ptr)
				538	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				539	SIZEOF(my_fdct_controller));
				540	cinfo->fdct = (struct jpeg_forward_dct *) fdct;
				541	fdct->pub.start_pass = start_pass_fdctmgr;
				542
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	543	/* First determine the DCT... */
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	544	switch (cinfo->dct_method) {
				545	#ifdef DCT_ISLOW_SUPPORTED
				546	case JDCT_ISLOW:
				547	fdct->pub.forward_DCT = forward_DCT;
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	548	if (jsimd_can_fdct_islow())
				549	fdct->dct = jsimd_fdct_islow;
				550	else
				551	fdct->dct = jpeg_fdct_islow;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	552	break;
				553	#endif
				554	#ifdef DCT_IFAST_SUPPORTED
				555	case JDCT_IFAST:
				556	fdct->pub.forward_DCT = forward_DCT;
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	557	if (jsimd_can_fdct_ifast())
				558	fdct->dct = jsimd_fdct_ifast;
				559	else
				560	fdct->dct = jpeg_fdct_ifast;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	561	break;
				562	#endif
				563	#ifdef DCT_FLOAT_SUPPORTED
				564	case JDCT_FLOAT:
				565	fdct->pub.forward_DCT = forward_DCT_float;
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	566	if (jsimd_can_fdct_float())
				567	fdct->float_dct = jsimd_fdct_float;
				568	else
				569	fdct->float_dct = jpeg_fdct_float;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	570	break;
				571	#endif
				572	default:
				573	ERREXIT(cinfo, JERR_NOT_COMPILED);
				574	break;
				575	}
				576
				577	/* ...then the supporting stages. */
				578	switch (cinfo->dct_method) {
				579	#ifdef DCT_ISLOW_SUPPORTED
				580	case JDCT_ISLOW:
				581	#endif
				582	#ifdef DCT_IFAST_SUPPORTED
				583	case JDCT_IFAST:
				584	#endif
				585	#if defined(DCT_ISLOW_SUPPORTED) \|\| defined(DCT_IFAST_SUPPORTED)
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	586	if (jsimd_can_convsamp())
				587	fdct->convsamp = jsimd_convsamp;
				588	else
				589	fdct->convsamp = convsamp;
				590	if (jsimd_can_quantize())
				591	fdct->quantize = jsimd_quantize;
				592	else
				593	fdct->quantize = quantize;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	594	break;
				595	#endif
				596	#ifdef DCT_FLOAT_SUPPORTED
				597	case JDCT_FLOAT:
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	598	if (jsimd_can_convsamp_float())
				599	fdct->float_convsamp = jsimd_convsamp_float;
				600	else
				601	fdct->float_convsamp = convsamp_float;
				602	if (jsimd_can_quantize_float())
				603	fdct->float_quantize = jsimd_quantize_float;
				604	else
				605	fdct->float_quantize = quantize_float;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	606	break;
				607	#endif
				608	default:
				609	ERREXIT(cinfo, JERR_NOT_COMPILED);
				610	break;
				611	}
				612
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	613	/* Allocate workspace memory */
				614	#ifdef DCT_FLOAT_SUPPORTED
				615	if (cinfo->dct_method == JDCT_FLOAT)
				616	fdct->float_workspace = (FAST_FLOAT *)
				617	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				618	SIZEOF(FAST_FLOAT) * DCTSIZE2);
				619	else
				620	#endif
				621	fdct->workspace = (DCTELEM *)
				622	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				623	SIZEOF(DCTELEM) * DCTSIZE2);
				624
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	625	/* Mark divisor tables unallocated */
				626	for (i = 0; i < NUM_QUANT_TBLS; i++) {
				627	fdct->divisors[i] = NULL;
				628	#ifdef DCT_FLOAT_SUPPORTED
				629	fdct->float_divisors[i] = NULL;
				630	#endif
				631	}
				632	}