Blame - jcdctmgr.c - platform/external/libjpeg-turbo

blob: 7d4d3a0673417f433f9fbebc421ae342e6f0fd84 [file] [log] [blame]

Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	1	/*
				2	* jcdctmgr.c
				3	*
DRC	a73e870	2012-12-31 02:52:30 +0000	[diff] [blame]	4	* This file was part of the Independent JPEG Group's software:
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	5	* Copyright (C) 1994-1996, Thomas G. Lane.
DRC	a6ef282	2013-09-28 03:23:49 +0000	[diff] [blame]	6	* libjpeg-turbo Modifications:
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	7	* Copyright (C) 1999-2006, MIYASAKA Masaru.
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	8	* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	9	* Copyright (C) 2011, 2014 D. R. Commander
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	10	* For conditions of distribution and use, see the accompanying README file.
				11	*
				12	* This file contains the forward-DCT management logic.
				13	* This code selects a particular DCT implementation to be used,
				14	* and it performs related housekeeping chores including coefficient
				15	* quantization.
				16	*/
				17
				18	#define JPEG_INTERNALS
				19	#include "jinclude.h"
				20	#include "jpeglib.h"
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	21	#include "jdct.h" /* Private declarations for DCT subsystem */
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	22	#include "jsimddct.h"
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	23
				24
				25	/* Private subobject for this module */
				26
DRC	bc56b75	2014-05-16 10:43:44 +0000	[diff] [blame]	27	typedef void (forward_DCT_method_ptr) (DCTELEM data);
				28	typedef void (float_DCT_method_ptr) (FAST_FLOAT data);
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	29
DRC	bc56b75	2014-05-16 10:43:44 +0000	[diff] [blame]	30	typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
				31	JDIMENSION start_col,
				32	DCTELEM * workspace);
				33	typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
				34	JDIMENSION start_col,
				35	FAST_FLOAT *workspace);
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	36
DRC	bc56b75	2014-05-16 10:43:44 +0000	[diff] [blame]	37	typedef void (quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM divisors,
				38	DCTELEM * workspace);
				39	typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
				40	FAST_FLOAT * divisors,
				41	FAST_FLOAT * workspace);
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	42
DRC	a49c4e5	2011-02-18 20:50:08 +0000	[diff] [blame]	43	METHODDEF(void) quantize (JCOEFPTR, DCTELEM , DCTELEM );
				44
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	45	typedef struct {
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	46	struct jpeg_forward_dct pub; /* public fields */
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	47
				48	/* Pointer to the DCT routine actually in use */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	49	forward_DCT_method_ptr dct;
				50	convsamp_method_ptr convsamp;
				51	quantize_method_ptr quantize;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	52
				53	/* The actual post-DCT divisors --- not identical to the quant table
				54	* entries, because of scaling (especially for an unnormalized DCT).
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	55	* Each table is given in normal array order.
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	56	*/
				57	DCTELEM * divisors[NUM_QUANT_TBLS];
				58
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	59	/* work area for FDCT subroutine */
				60	DCTELEM * workspace;
				61
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	62	#ifdef DCT_FLOAT_SUPPORTED
				63	/* Same as above for the floating-point case. */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	64	float_DCT_method_ptr float_dct;
				65	float_convsamp_method_ptr float_convsamp;
				66	float_quantize_method_ptr float_quantize;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	67	FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	68	FAST_FLOAT * float_workspace;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	69	#endif
				70	} my_fdct_controller;
				71
				72	typedef my_fdct_controller * my_fdct_ptr;
				73
				74
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	75	#if BITS_IN_JSAMPLE == 8
				76
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	77	/*
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	78	* Find the highest bit in an integer through binary search.
				79	*/
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	80
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	81	LOCAL(int)
DRC	fc5dc4f	2009-10-01 22:26:14 +0000	[diff] [blame]	82	flss (UINT16 val)
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	83	{
				84	int bit;
				85
				86	bit = 16;
				87
				88	if (!val)
				89	return 0;
				90
				91	if (!(val & 0xff00)) {
				92	bit -= 8;
				93	val <<= 8;
				94	}
				95	if (!(val & 0xf000)) {
				96	bit -= 4;
				97	val <<= 4;
				98	}
				99	if (!(val & 0xc000)) {
				100	bit -= 2;
				101	val <<= 2;
				102	}
				103	if (!(val & 0x8000)) {
				104	bit -= 1;
				105	val <<= 1;
				106	}
				107
				108	return bit;
				109	}
				110
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	111
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	112	/*
				113	* Compute values to do a division using reciprocal.
				114	*
				115	* This implementation is based on an algorithm described in
				116	* "How to optimize for the Pentium family of microprocessors"
				117	* (http://www.agner.org/assem/).
				118	* More information about the basic algorithm can be found in
				119	* the paper "Integer Division Using Reciprocals" by Robert Alverson.
				120	*
				121	* The basic idea is to replace x/d by x * d^-1. In order to store
				122	* d^-1 with enough precision we shift it left a few places. It turns
				123	* out that this algoright gives just enough precision, and also fits
				124	* into DCTELEM:
				125	*
				126	* b = (the number of significant bits in divisor) - 1
				127	* r = (word size) + b
				128	* f = 2^r / divisor
				129	*
				130	* f will not be an integer for most cases, so we need to compensate
				131	* for the rounding error introduced:
				132	*
				133	* no fractional part:
				134	*
				135	* result = input >> r
				136	*
				137	* fractional part of f < 0.5:
				138	*
				139	* round f down to nearest integer
				140	* result = ((input + 1) * f) >> r
				141	*
				142	* fractional part of f > 0.5:
				143	*
				144	* round f up to nearest integer
				145	* result = (input * f) >> r
				146	*
				147	* This is the original algorithm that gives truncated results. But we
				148	* want properly rounded results, so we replace "input" with
				149	* "input + divisor/2".
				150	*
				151	* In order to allow SIMD implementations we also tweak the values to
				152	* allow the same calculation to be made at all times:
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	153	*
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	154	* dctbl[0] = f rounded to nearest integer
				155	* dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
				156	* dctbl[2] = 1 << ((word size) * 2 - r)
				157	* dctbl[3] = r - (word size)
				158	*
				159	* dctbl[2] is for stupid instruction sets where the shift operation
				160	* isn't member wise (e.g. MMX).
				161	*
				162	* The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
				163	* is that most SIMD implementations have a "multiply and store top
				164	* half" operation.
				165	*
				166	* Lastly, we store each of the values in their own table instead
				167	* of in a consecutive manner, yet again in order to allow SIMD
				168	* routines.
				169	*/
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	170
DRC	a49c4e5	2011-02-18 20:50:08 +0000	[diff] [blame]	171	LOCAL(int)
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	172	compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
				173	{
				174	UDCTELEM2 fq, fr;
				175	UDCTELEM c;
				176	int b, r;
				177
DRC	fc5dc4f	2009-10-01 22:26:14 +0000	[diff] [blame]	178	b = flss(divisor) - 1;
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	179	r = sizeof(DCTELEM) * 8 + b;
				180
				181	fq = ((UDCTELEM2)1 << r) / divisor;
				182	fr = ((UDCTELEM2)1 << r) % divisor;
				183
				184	c = divisor / 2; /* for rounding */
				185
				186	if (fr == 0) { /* divisor is power of two */
				187	/* fq will be one bit too large to fit in DCTELEM, so adjust */
				188	fq >>= 1;
				189	r--;
DRC	d65d99a	2012-01-31 03:39:23 +0000	[diff] [blame]	190	} else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	191	c++;
				192	} else { /* fractional part is > 0.5 */
				193	fq++;
				194	}
				195
				196	dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
				197	dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
				198	dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)82 - r)); /* scale */
				199	dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)8; / shift */
DRC	a49c4e5	2011-02-18 20:50:08 +0000	[diff] [blame]	200
				201	if(r <= 16) return 0;
				202	else return 1;
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	203	}
				204
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	205	#endif
				206
				207
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	208	/*
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	209	* Initialize for a processing pass.
				210	* Verify that all referenced Q-tables are present, and set up
				211	* the divisor table for each one.
				212	* In the current implementation, DCT of all components is done during
				213	* the first pass, even if only some components will be output in the
				214	* first scan. Hence all components should be examined here.
				215	*/
				216
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	217	METHODDEF(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	218	start_pass_fdctmgr (j_compress_ptr cinfo)
				219	{
				220	my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
				221	int ci, qtblno, i;
				222	jpeg_component_info *compptr;
				223	JQUANT_TBL * qtbl;
				224	DCTELEM * dtbl;
				225
				226	for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
				227	ci++, compptr++) {
				228	qtblno = compptr->quant_tbl_no;
				229	/* Make sure specified quantization table is present */
				230	if (qtblno < 0 \|\| qtblno >= NUM_QUANT_TBLS \|\|
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	231	cinfo->quant_tbl_ptrs[qtblno] == NULL)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	232	ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
				233	qtbl = cinfo->quant_tbl_ptrs[qtblno];
				234	/* Compute divisors for this quant table */
				235	/* We may do this more than once for same table, but it's not a big deal */
				236	switch (cinfo->dct_method) {
				237	#ifdef DCT_ISLOW_SUPPORTED
				238	case JDCT_ISLOW:
				239	/* For LL&M IDCT method, divisors are equal to raw quantization
				240	* coefficients multiplied by 8 (to counteract scaling).
				241	*/
				242	if (fdct->divisors[qtblno] == NULL) {
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	243	fdct->divisors[qtblno] = (DCTELEM *)
				244	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DRC	5de454b	2014-05-18 19:04:03 +0000	[diff] [blame]	245	(DCTSIZE2 * 4) * sizeof(DCTELEM));
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	246	}
				247	dtbl = fdct->divisors[qtblno];
				248	for (i = 0; i < DCTSIZE2; i++) {
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	249	#if BITS_IN_JSAMPLE == 8
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	250	if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
				251	&& fdct->quantize == jsimd_quantize)
				252	fdct->quantize = quantize;
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	253	#else
				254	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
				255	#endif
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	256	}
				257	break;
				258	#endif
				259	#ifdef DCT_IFAST_SUPPORTED
				260	case JDCT_IFAST:
				261	{
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	262	/* For AA&N IDCT method, divisors are equal to quantization
				263	* coefficients scaled by scalefactor[row]*scalefactor[col], where
				264	* scalefactor[0] = 1
				265	* scalefactor[k] = cos(kPI/16) sqrt(2) for k=1..7
				266	* We apply a further scale factor of 8.
				267	*/
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	268	#define CONST_BITS 14
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	269	static const INT16 aanscales[DCTSIZE2] = {
				270	/* precomputed values scaled up by 14 bits */
				271	16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
				272	22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
				273	21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
				274	19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
				275	16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
				276	12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
				277	8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
				278	4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
				279	};
				280	SHIFT_TEMPS
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	281
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	282	if (fdct->divisors[qtblno] == NULL) {
				283	fdct->divisors[qtblno] = (DCTELEM *)
				284	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DRC	5de454b	2014-05-18 19:04:03 +0000	[diff] [blame]	285	(DCTSIZE2 * 4) * sizeof(DCTELEM));
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	286	}
				287	dtbl = fdct->divisors[qtblno];
				288	for (i = 0; i < DCTSIZE2; i++) {
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	289	#if BITS_IN_JSAMPLE == 8
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	290	if(!compute_reciprocal(
				291	DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
				292	(INT32) aanscales[i]),
				293	CONST_BITS-3), &dtbl[i])
				294	&& fdct->quantize == jsimd_quantize)
				295	fdct->quantize = quantize;
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	296	#else
				297	dtbl[i] = (DCTELEM)
				298	DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
				299	(INT32) aanscales[i]),
				300	CONST_BITS-3);
				301	#endif
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	302	}
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	303	}
				304	break;
				305	#endif
				306	#ifdef DCT_FLOAT_SUPPORTED
				307	case JDCT_FLOAT:
				308	{
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	309	/* For float AA&N IDCT method, divisors are equal to quantization
				310	* coefficients scaled by scalefactor[row]*scalefactor[col], where
				311	* scalefactor[0] = 1
				312	* scalefactor[k] = cos(kPI/16) sqrt(2) for k=1..7
				313	* We apply a further scale factor of 8.
				314	* What's actually stored is 1/divisor so that the inner loop can
				315	* use a multiplication rather than a division.
				316	*/
				317	FAST_FLOAT * fdtbl;
				318	int row, col;
				319	static const double aanscalefactor[DCTSIZE] = {
				320	1.0, 1.387039845, 1.306562965, 1.175875602,
				321	1.0, 0.785694958, 0.541196100, 0.275899379
				322	};
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	323
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	324	if (fdct->float_divisors[qtblno] == NULL) {
				325	fdct->float_divisors[qtblno] = (FAST_FLOAT *)
				326	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DRC	5de454b	2014-05-18 19:04:03 +0000	[diff] [blame]	327	DCTSIZE2 * sizeof(FAST_FLOAT));
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	328	}
				329	fdtbl = fdct->float_divisors[qtblno];
				330	i = 0;
				331	for (row = 0; row < DCTSIZE; row++) {
				332	for (col = 0; col < DCTSIZE; col++) {
				333	fdtbl[i] = (FAST_FLOAT)
				334	(1.0 / (((double) qtbl->quantval[i] *
				335	aanscalefactor[row] * aanscalefactor[col] * 8.0)));
				336	i++;
				337	}
				338	}
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	339	}
				340	break;
				341	#endif
				342	default:
				343	ERREXIT(cinfo, JERR_NOT_COMPILED);
				344	break;
				345	}
				346	}
				347	}
				348
				349
				350	/*
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	351	* Load data into workspace, applying unsigned->signed conversion.
				352	*/
				353
				354	METHODDEF(void)
				355	convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
				356	{
				357	register DCTELEM *workspaceptr;
				358	register JSAMPROW elemptr;
				359	register int elemr;
				360
				361	workspaceptr = workspace;
				362	for (elemr = 0; elemr < DCTSIZE; elemr++) {
				363	elemptr = sample_data[elemr] + start_col;
				364
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	365	#if DCTSIZE == 8 /* unroll the inner loop */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	366	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				367	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				368	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				369	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				370	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				371	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				372	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				373	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				374	#else
				375	{
				376	register int elemc;
				377	for (elemc = DCTSIZE; elemc > 0; elemc--)
				378	workspaceptr++ = GETJSAMPLE(elemptr++) - CENTERJSAMPLE;
				379	}
				380	#endif
				381	}
				382	}
				383
				384
				385	/*
				386	* Quantize/descale the coefficients, and store into coef_blocks[].
				387	*/
				388
				389	METHODDEF(void)
				390	quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
				391	{
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	392	int i;
				393	DCTELEM temp;
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	394	JCOEFPTR output_ptr = coef_block;
				395
				396	#if BITS_IN_JSAMPLE == 8
				397
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	398	UDCTELEM recip, corr, shift;
				399	UDCTELEM2 product;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	400
				401	for (i = 0; i < DCTSIZE2; i++) {
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	402	temp = workspace[i];
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	403	recip = divisors[i + DCTSIZE2 * 0];
				404	corr = divisors[i + DCTSIZE2 * 1];
				405	shift = divisors[i + DCTSIZE2 * 3];
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	406
				407	if (temp < 0) {
				408	temp = -temp;
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	409	product = (UDCTELEM2)(temp + corr) * recip;
				410	product >>= shift + sizeof(DCTELEM)*8;
				411	temp = product;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	412	temp = -temp;
				413	} else {
Pierre Ossman	dedc42e	2009-03-09 13:23:04 +0000	[diff] [blame]	414	product = (UDCTELEM2)(temp + corr) * recip;
				415	product >>= shift + sizeof(DCTELEM)*8;
				416	temp = product;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	417	}
				418	output_ptr[i] = (JCOEF) temp;
				419	}
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	420
				421	#else
				422
				423	register DCTELEM qval;
				424
				425	for (i = 0; i < DCTSIZE2; i++) {
				426	qval = divisors[i];
				427	temp = workspace[i];
				428	/* Divide the coefficient value by qval, ensuring proper rounding.
				429	* Since C does not specify the direction of rounding for negative
				430	* quotients, we have to force the dividend positive for portability.
				431	*
				432	* In most files, at least half of the output values will be zero
				433	* (at default quantization settings, more like three-quarters...)
				434	* so we should ensure that this case is fast. On many machines,
				435	* a comparison is enough cheaper than a divide to make a special test
				436	* a win. Since both inputs will be nonnegative, we need only test
				437	* for a < b to discover whether a/b is 0.
				438	* If your machine's division is fast enough, define FAST_DIVIDE.
				439	*/
				440	#ifdef FAST_DIVIDE
				441	#define DIVIDE_BY(a,b) a /= b
				442	#else
				443	#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
				444	#endif
				445	if (temp < 0) {
				446	temp = -temp;
				447	temp += qval>>1; /* for rounding */
				448	DIVIDE_BY(temp, qval);
DRC	eca0637	2014-11-06 09:32:38 +0000	[diff] [blame]	449	temp = -temp;
DRC	aee4f72	2014-08-09 23:06:07 +0000	[diff] [blame]	450	} else {
				451	temp += qval>>1; /* for rounding */
				452	DIVIDE_BY(temp, qval);
				453	}
				454	output_ptr[i] = (JCOEF) temp;
				455	}
				456
				457	#endif
				458
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	459	}
				460
				461
				462	/*
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	463	* Perform forward DCT on one or more blocks of a component.
				464	*
				465	* The input samples are taken from the sample_data[] array starting at
				466	* position start_row/start_col, and moving to the right for any additional
Thomas G. Lane	bc79e06	1995-08-02 00:00:00 +0000	[diff] [blame]	467	* blocks. The quantized coefficients are returned in coef_blocks[].
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	468	*/
				469
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	470	METHODDEF(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	471	forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	472	JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
				473	JDIMENSION start_row, JDIMENSION start_col,
				474	JDIMENSION num_blocks)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	475	/* This version is used for integer DCT implementations. */
				476	{
				477	/* This routine is heavily used, so it's worth coding it tightly. */
				478	my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	479	DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	480	DCTELEM * workspace;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	481	JDIMENSION bi;
				482
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	483	/* Make sure the compiler doesn't look up these every pass */
				484	forward_DCT_method_ptr do_dct = fdct->dct;
				485	convsamp_method_ptr do_convsamp = fdct->convsamp;
				486	quantize_method_ptr do_quantize = fdct->quantize;
Pierre Ossman	dc5db14	2009-03-13 12:17:26 +0000	[diff] [blame]	487	workspace = fdct->workspace;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	488
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	489	sample_data += start_row; /* fold in the vertical offset once */
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	490
				491	for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
				492	/* Load data into workspace, applying unsigned->signed conversion */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	493	(*do_convsamp) (sample_data, start_col, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	494
				495	/* Perform the DCT */
				496	(*do_dct) (workspace);
				497
				498	/* Quantize/descale the coefficients, and store into coef_blocks[] */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	499	(*do_quantize) (coef_blocks[bi], divisors, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	500	}
				501	}
				502
				503
				504	#ifdef DCT_FLOAT_SUPPORTED
				505
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	506
				507	METHODDEF(void)
				508	convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
				509	{
				510	register FAST_FLOAT *workspaceptr;
				511	register JSAMPROW elemptr;
				512	register int elemr;
				513
				514	workspaceptr = workspace;
				515	for (elemr = 0; elemr < DCTSIZE; elemr++) {
				516	elemptr = sample_data[elemr] + start_col;
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	517	#if DCTSIZE == 8 /* unroll the inner loop */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	518	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				519	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				520	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				521	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				522	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				523	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				524	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				525	workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(elemptr++) - CENTERJSAMPLE);
				526	#else
				527	{
				528	register int elemc;
				529	for (elemc = DCTSIZE; elemc > 0; elemc--)
				530	*workspaceptr++ = (FAST_FLOAT)
				531	(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
				532	}
				533	#endif
				534	}
				535	}
				536
				537
				538	METHODDEF(void)
				539	quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
				540	{
				541	register FAST_FLOAT temp;
				542	register int i;
				543	register JCOEFPTR output_ptr = coef_block;
				544
				545	for (i = 0; i < DCTSIZE2; i++) {
				546	/* Apply the quantization and scaling factor */
				547	temp = workspace[i] * divisors[i];
				548
				549	/* Round to nearest integer.
				550	* Since C does not specify the direction of rounding for negative
				551	* quotients, we have to force the dividend positive for portability.
				552	* The maximum coefficient size is +-16K (for 12-bit data), so this
				553	* code should work for either 16-bit or 32-bit ints.
				554	*/
				555	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
				556	}
				557	}
				558
				559
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	560	METHODDEF(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	561	forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	562	JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
				563	JDIMENSION start_row, JDIMENSION start_col,
				564	JDIMENSION num_blocks)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	565	/* This version is used for floating-point DCT implementations. */
				566	{
				567	/* This routine is heavily used, so it's worth coding it tightly. */
				568	my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	569	FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	570	FAST_FLOAT * workspace;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	571	JDIMENSION bi;
				572
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	573
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	574	/* Make sure the compiler doesn't look up these every pass */
				575	float_DCT_method_ptr do_dct = fdct->float_dct;
				576	float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
				577	float_quantize_method_ptr do_quantize = fdct->float_quantize;
Pierre Ossman	dc5db14	2009-03-13 12:17:26 +0000	[diff] [blame]	578	workspace = fdct->float_workspace;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	579
DRC	e5eaf37	2014-05-09 18:00:32 +0000	[diff] [blame]	580	sample_data += start_row; /* fold in the vertical offset once */
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	581
				582	for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
				583	/* Load data into workspace, applying unsigned->signed conversion */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	584	(*do_convsamp) (sample_data, start_col, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	585
				586	/* Perform the DCT */
				587	(*do_dct) (workspace);
				588
				589	/* Quantize/descale the coefficients, and store into coef_blocks[] */
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	590	(*do_quantize) (coef_blocks[bi], divisors, workspace);
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	591	}
				592	}
				593
				594	#endif /* DCT_FLOAT_SUPPORTED */
				595
				596
				597	/*
				598	* Initialize FDCT manager.
				599	*/
				600
Thomas G. Lane	489583f	1996-02-07 00:00:00 +0000	[diff] [blame]	601	GLOBAL(void)
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	602	jinit_forward_dct (j_compress_ptr cinfo)
				603	{
				604	my_fdct_ptr fdct;
				605	int i;
				606
				607	fdct = (my_fdct_ptr)
				608	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DRC	5de454b	2014-05-18 19:04:03 +0000	[diff] [blame]	609	sizeof(my_fdct_controller));
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	610	cinfo->fdct = (struct jpeg_forward_dct *) fdct;
				611	fdct->pub.start_pass = start_pass_fdctmgr;
				612
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	613	/* First determine the DCT... */
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	614	switch (cinfo->dct_method) {
				615	#ifdef DCT_ISLOW_SUPPORTED
				616	case JDCT_ISLOW:
				617	fdct->pub.forward_DCT = forward_DCT;
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	618	if (jsimd_can_fdct_islow())
				619	fdct->dct = jsimd_fdct_islow;
				620	else
				621	fdct->dct = jpeg_fdct_islow;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	622	break;
				623	#endif
				624	#ifdef DCT_IFAST_SUPPORTED
				625	case JDCT_IFAST:
				626	fdct->pub.forward_DCT = forward_DCT;
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	627	if (jsimd_can_fdct_ifast())
				628	fdct->dct = jsimd_fdct_ifast;
				629	else
				630	fdct->dct = jpeg_fdct_ifast;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	631	break;
				632	#endif
				633	#ifdef DCT_FLOAT_SUPPORTED
				634	case JDCT_FLOAT:
				635	fdct->pub.forward_DCT = forward_DCT_float;
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	636	if (jsimd_can_fdct_float())
				637	fdct->float_dct = jsimd_fdct_float;
				638	else
				639	fdct->float_dct = jpeg_fdct_float;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	640	break;
				641	#endif
				642	default:
				643	ERREXIT(cinfo, JERR_NOT_COMPILED);
				644	break;
				645	}
				646
				647	/* ...then the supporting stages. */
				648	switch (cinfo->dct_method) {
				649	#ifdef DCT_ISLOW_SUPPORTED
				650	case JDCT_ISLOW:
				651	#endif
				652	#ifdef DCT_IFAST_SUPPORTED
				653	case JDCT_IFAST:
				654	#endif
				655	#if defined(DCT_ISLOW_SUPPORTED) \|\| defined(DCT_IFAST_SUPPORTED)
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	656	if (jsimd_can_convsamp())
				657	fdct->convsamp = jsimd_convsamp;
				658	else
				659	fdct->convsamp = convsamp;
				660	if (jsimd_can_quantize())
				661	fdct->quantize = jsimd_quantize;
				662	else
				663	fdct->quantize = quantize;
Pierre Ossman	49dcbfb	2009-03-09 10:37:20 +0000	[diff] [blame]	664	break;
				665	#endif
				666	#ifdef DCT_FLOAT_SUPPORTED
				667	case JDCT_FLOAT:
Pierre Ossman	59a3938	2009-03-09 13:15:56 +0000	[diff] [blame]	668	if (jsimd_can_convsamp_float())
				669	fdct->float_convsamp = jsimd_convsamp_float;
				670	else
				671	fdct->float_convsamp = convsamp_float;
				672	if (jsimd_can_quantize_float())
				673	fdct->float_quantize = jsimd_quantize_float;
				674	else
				675	fdct->float_quantize = quantize_float;
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	676	break;
				677	#endif
				678	default:
				679	ERREXIT(cinfo, JERR_NOT_COMPILED);
				680	break;
				681	}
				682
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	683	/* Allocate workspace memory */
				684	#ifdef DCT_FLOAT_SUPPORTED
				685	if (cinfo->dct_method == JDCT_FLOAT)
				686	fdct->float_workspace = (FAST_FLOAT *)
				687	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DRC	5de454b	2014-05-18 19:04:03 +0000	[diff] [blame]	688	sizeof(FAST_FLOAT) * DCTSIZE2);
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	689	else
				690	#endif
				691	fdct->workspace = (DCTELEM *)
				692	(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
DRC	5de454b	2014-05-18 19:04:03 +0000	[diff] [blame]	693	sizeof(DCTELEM) * DCTSIZE2);
Pierre Ossman	35c4719	2009-03-09 13:29:37 +0000	[diff] [blame]	694
Thomas G. Lane	36a4ccc	1994-09-24 00:00:00 +0000	[diff] [blame]	695	/* Mark divisor tables unallocated */
				696	for (i = 0; i < NUM_QUANT_TBLS; i++) {
				697	fdct->divisors[i] = NULL;
				698	#ifdef DCT_FLOAT_SUPPORTED
				699	fdct->float_divisors[i] = NULL;
				700	#endif
				701	}
				702	}