Merge the TurboJPEG planar YUV encoding feature from VirtualGL 2.2


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@305 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf86b57..a6c6cd7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -209,6 +209,7 @@
 
 enable_testing()
 add_test(jpegut jpegut)
+add_test(jpegut-yuv jpegut -yuv)
 add_test(cjpeg-int sharedlib/cjpeg -dct int -outfile testoutint.jpg ${CMAKE_SOURCE_DIR}/testorig.ppm)
 add_test(cjpeg-int-cmp ${CMAKE_COMMAND} -E compare_files ${CMAKE_SOURCE_DIR}/testimgint.jpg testoutint.jpg)
 add_test(cjpeg-fast sharedlib/cjpeg -dct fast -opt -outfile testoutfst.jpg ${CMAKE_SOURCE_DIR}/testorig.ppm)
@@ -251,6 +252,7 @@
 add_test(jpegtran-crop-cmp ${CMAKE_COMMAND} -E compare_files ${CMAKE_SOURCE_DIR}/testimgcrop.jpg testoutcrop.jpg)
 
 add_test(jpegut-static jpegut-static)
+add_test(jpegut-static-yuv jpegut-static -yuv)
 add_test(cjpeg-static-int cjpeg-static -dct int -outfile testoutint.jpg ${CMAKE_SOURCE_DIR}/testorig.ppm)
 add_test(cjpeg-static-int-cmp ${CMAKE_COMMAND} -E compare_files ${CMAKE_SOURCE_DIR}/testimgint.jpg testoutint.jpg)
 add_test(cjpeg-static-fast cjpeg-static -dct fast -opt -outfile testoutfst.jpg ${CMAKE_SOURCE_DIR}/testorig.ppm)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index f250c8a..7a7a671 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -21,6 +21,9 @@
 [7] Added arithmetic encoding and decoding support (can be disabled via
 configure or CMake options)
 
+[8] TurboJPEG/OSS can now leverage the SIMD-accelerated color conversion
+routines in libjpeg-turbo to generate planar YUV images from RGB input.
+
 
 Significant changes since 1.0.0
 ===============================
diff --git a/Makefile.am b/Makefile.am
index 74df225..2f7e773 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -124,6 +124,7 @@
 
 test: testclean all
 	./jpegut
+	./jpegut -yuv
 	./cjpeg -dct int -outfile testoutint.jpg $(srcdir)/testorig.ppm
 	cmp $(srcdir)/testimgint.jpg testoutint.jpg
 	./cjpeg -dct fast -opt -outfile testoutfst.jpg $(srcdir)/testorig.ppm
diff --git a/jpegut.c b/jpegut.c
index db46b6d..3344ca8 100644
--- a/jpegut.c
+++ b/jpegut.c
@@ -1,6 +1,6 @@
 /* Copyright (C)2004 Landmark Graphics Corporation
  * Copyright (C)2005 Sun Microsystems, Inc.
- * Copyright (C)2009 D. R. Commander
+ * Copyright (C)2009-2010 D. R. Commander
  *
  * This library is free software and may be redistributed and/or modified under
  * the terms of the wxWindows Library License, Version 3.1 or (at your option)
@@ -18,11 +18,18 @@
 #include <string.h>
 #include "./rrtimer.h"
 #include "./turbojpeg.h"
+#ifndef _WIN32
+ #define stricmp strcasecmp
+#endif
 
 #define _catch(f) {if((f)==-1) {printf("TJPEG: %s\n", tjGetErrorStr());  bailout();}}
 
 const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
 const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
+const int _hsf[NUMSUBOPT]={1, 2, 2, 1};
+const int _vsf[NUMSUBOPT]={1, 1, 2, 1};
+
+int yuv=0;
 
 int exitstatus=0;
 #define bailout() {exitstatus=-1;  goto finally;}
@@ -197,6 +204,126 @@
 	return 1;
 }
 
+#define checkval(v, cv) { \
+	if(v<cv-1 || v>cv+1) { \
+		printf("\nComp. %s at %d,%d should be %d, not %d\n", #v, i, j, cv, v); \
+		retval=0;  goto bailout; \
+	}}
+
+#define checkval0(v) { \
+	if(v>1) { \
+		printf("\nComp. %s at %d,%d should be 0, not %d\n", #v, i, j, v); \
+		retval=0;  goto bailout; \
+	}}
+
+#define checkval255(v) { \
+	if(v<254 && !(v==217 && i==0 && j==21)) { \
+		printf("\nComp. %s at %d,%d should be 255, not %d\n", #v, i, j, v); \
+		retval=0;  goto bailout; \
+	}}
+
+#define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
+
+/* Validate a planar YUV image against the checkerboard values jpegut expects.
+ * buf holds the Y plane, then (unless subsamp is TJ_GRAYSCALE) the U and V
+ * planes; w and h are padded to the subsampling factors and each plane row
+ * is padded to a multiple of 4 bytes.  size must equal the resulting total.
+ * Returns 1 on success.  On failure, prints the first mismatch, dumps the
+ * planes (when size is large enough to hold them), and returns 0. */
+int checkbufyuv(unsigned char *buf, unsigned long size, int w, int h,
+	int subsamp)
+{
+	int i, j, p, retval=1;
+	int hsf=_hsf[subsamp], vsf=_vsf[subsamp];
+	int pw=PAD(w, hsf), ph=PAD(h, vsf);
+	int cw=pw/hsf, ch=ph/vsf;
+	int ypitch=PAD(pw, 4), uvpitch=PAD(cw, 4);
+	int nplanes=(subsamp==TJ_GRAYSCALE? 1:3);  /* grayscale has no U/V planes */
+	unsigned long correctsize=ypitch*ph + (nplanes==1? 0:uvpitch*ch*2);
+
+	if(size!=correctsize)
+	{
+		printf("\nIncorrect size %lu.  Should be %lu\n", size, correctsize);
+		retval=0;  goto bailout;
+	}
+
+	for(i=0; i<16; i++)
+	{
+		for(j=0; j<pw; j++)
+		{
+			unsigned char y=buf[ypitch*i+j];
+			if(((i/8)+(j/8))%2==0) checkval255(y)
+			else checkval(y, 76)
+		}
+	}
+	for(i=16; i<ph; i++)
+	{
+		for(j=0; j<pw; j++)
+		{
+			unsigned char y=buf[ypitch*i+j];
+			if(((i/8)+(j/8))%2==0) checkval0(y)
+			else checkval(y, 226)
+		}
+	}
+	if(subsamp!=TJ_GRAYSCALE)
+	{
+		for(i=0; i<16/vsf; i++)
+		{
+			for(j=0; j<cw; j++)
+			{
+				unsigned char u=buf[ypitch*ph + (uvpitch*i+j)],
+					v=buf[ypitch*ph + uvpitch*ch + (uvpitch*i+j)];
+				if(((i*vsf/8)+(j*hsf/8))%2==0)
+				{
+					checkval(u, 128);  checkval(v, 128);
+				}
+				else
+				{
+					checkval(u, 85);  checkval255(v);
+				}
+			}
+		}
+		for(i=16/vsf; i<ch; i++)
+		{
+			for(j=0; j<cw; j++)
+			{
+				unsigned char u=buf[ypitch*ph + (uvpitch*i+j)],
+					v=buf[ypitch*ph + uvpitch*ch + (uvpitch*i+j)];
+				if(((i*vsf/8)+(j*hsf/8))%2==0)
+				{
+					checkval(u, 128);  checkval(v, 128);
+				}
+				else
+				{
+					checkval0(u);  checkval(v, 149);
+				}
+			}
+		}
+	}
+
+	bailout:
+	/* Dump only planes that exist and that fit within the size bytes given. */
+	if(retval==0 && size>=correctsize)
+	{
+		unsigned char *plane=buf;
+		for(p=0; p<nplanes; p++)
+		{
+			int rows=(p==0? ph:ch), cols=(p==0? pw:cw);
+			int pitch=(p==0? ypitch:uvpitch);
+			for(i=0; i<rows; i++)
+			{
+				for(j=0; j<cols; j++)
+					printf("%.3d ", plane[pitch*i+j]);
+				printf("\n");
+			}
+			printf("\n");
+			plane+=pitch*rows;
+		}
+	}
+
+	return retval;
+}
+
 void writejpeg(unsigned char *jpegbuf, unsigned long jpgbufsize, char *filename)
 {
 	FILE *outfile=NULL;
@@ -221,6 +348,8 @@
 	char tempstr[1024];  unsigned char *bmpbuf=NULL;
 	const char *pixformat;  double t;
 
+	if(yuv) flags|=TJ_YUV;
+
 	if(flags&TJ_BGR)
 	{
 		if(ps==3) pixformat="BGR";
@@ -232,8 +361,12 @@
 		else {if(flags&TJ_ALPHAFIRST) pixformat="ARGB";  else pixformat="RGBA";}
 	}
 	if(ps==1) pixformat="Grayscale";
-	printf("%s %s -> %s Q%d ... ", pixformat,
-		(flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ", _subnamel[subsamp], qual);
+	if(yuv)
+		printf("%s %s -> %s YUV ... ", pixformat,
+			(flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ", _subnamel[subsamp]);
+	else
+		printf("%s %s -> %s Q%d ... ", pixformat,
+			(flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ", _subnamel[subsamp], qual);
 
 	if((bmpbuf=(unsigned char *)malloc(w*h*ps+1))==NULL)
 	{
@@ -246,10 +379,20 @@
 	_catch(tjCompress(hnd, bmpbuf, w, 0, h, ps, jpegbuf, size, subsamp, qual, flags));
 	t=rrtime()-t;
 
-	sprintf(tempstr, "%s_enc_%s_%s_%sQ%d.jpg", basefilename, pixformat,
-		(flags&TJ_BOTTOMUP)? "BU":"TD", _subnames[subsamp], qual);
+	if(yuv)
+		sprintf(tempstr, "%s_enc_%s_%s_%s.yuv", basefilename, pixformat,
+			(flags&TJ_BOTTOMUP)? "BU":"TD", _subnames[subsamp]);
+	else
+		sprintf(tempstr, "%s_enc_%s_%s_%sQ%d.jpg", basefilename, pixformat,
+			(flags&TJ_BOTTOMUP)? "BU":"TD", _subnames[subsamp], qual);
 	writejpeg(jpegbuf, *size, tempstr);
-	printf("Done.  %f ms\n  Result in %s\n", t*1000., tempstr);
+	if(yuv)
+	{
+		if(checkbufyuv(jpegbuf, *size, w, h, subsamp)) printf("Passed.");
+		else printf("FAILED!");
+	}
+	else printf("Done.");
+	printf("  %f ms\n  Result in %s\n", t*1000., tempstr);
 
 	finally:
 	if(bmpbuf) free(bmpbuf);
@@ -261,6 +404,8 @@
 	unsigned char *bmpbuf=NULL;
 	const char *pixformat;  int _w=0, _h=0;  double t;
 
+	if(yuv) return;
+
 	if(flags&TJ_BGR)
 	{
 		if(ps==3) pixformat="BGR";
@@ -404,8 +549,16 @@
 
 int main(int argc, char *argv[])
 {
+	if(argc>1 && !stricmp(argv[1], "-yuv")) yuv=1;
 	dotest(35, 41, 3, TJ_444, "test");
 	dotest(35, 41, 4, TJ_444, "test");
+	if(yuv)
+	{
+		dotest(35, 41, 3, TJ_422, "test");
+		dotest(35, 41, 4, TJ_422, "test");
+		dotest(35, 41, 3, TJ_420, "test");
+		dotest(35, 41, 4, TJ_420, "test");
+	}
 	dotest(35, 41, 1, TJ_GRAYSCALE, "test");
 	dotest(35, 41, 3, TJ_GRAYSCALE, "test");
 	dotest(35, 41, 4, TJ_GRAYSCALE, "test");
diff --git a/jpgtest.cxx b/jpgtest.cxx
index 79de5cf..96e5173 100644
--- a/jpgtest.cxx
+++ b/jpgtest.cxx
@@ -30,7 +30,7 @@
 #define _throwbmp(m) _throw(m, bmpgeterr())
 
 int forcemmx=0, forcesse=0, forcesse2=0, forcesse3=0, fastupsample=0,
-	decomponly=0;
+	decomponly=0, yuv=0;
 const int _ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4};
 const int _flags[BMPPIXELFORMATS]={0, 0, TJ_BGR, TJ_BGR,
 	TJ_BGR|TJ_ALPHAFIRST, TJ_ALPHAFIRST};
@@ -76,12 +76,20 @@
 
 	flags |= _flags[pf];
 	if(bu) flags |= TJ_BOTTOMUP;
+	if(yuv) flags |= TJ_YUV;
 
 	if((rgbbuf=(unsigned char *)malloc(pitch*h)) == NULL)
 		_throwunix("allocating image buffer");
 
-	if(!quiet) printf("\n>>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", _pfname[pf],
-		bu?"Bottom-up":"Top-down", _subnamel[jpegsub], qual);
+	if(!quiet)
+	{
+		if(yuv)
+			printf("\n>>>>>  %s (%s) <--> YUV %s  <<<<<\n", _pfname[pf],
+				bu?"Bottom-up":"Top-down", _subnamel[jpegsub]);
+		else
+			printf("\n>>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", _pfname[pf],
+				bu?"Bottom-up":"Top-down", _subnamel[jpegsub], qual);
+	}
 	if(dotile) {tilesizex=tilesizey=4;}  else {tilesizex=w;  tilesizey=h;}
 
 	do
@@ -155,7 +163,10 @@
 		}
 		if(tilesizex==w && tilesizey==h)
 		{
-			sprintf(tempstr, "%s_%sQ%d.jpg", filename, _subnames[jpegsub], qual);
+			if(yuv)
+				sprintf(tempstr, "%s_%s.yuv", filename, _subnames[jpegsub]);
+			else
+				sprintf(tempstr, "%s_%sQ%d.jpg", filename, _subnames[jpegsub], qual);
 			if((outfile=fopen(tempstr, "wb"))==NULL)
 				_throwunix("opening reference image");
 			if(fwrite(jpegbuf[0], jpgbufsize, 1, outfile)!=1)
@@ -163,6 +174,7 @@
 			fclose(outfile);
 			if(!quiet) printf("Reference image written to %s\n", tempstr);
 		}
+		if(yuv) goto bailout;
 
 		// Decompression test
 		memset(rgbbuf, 127, pitch*h);  // Grey image means decompressor did nothing
@@ -371,7 +383,9 @@
 	printf("       Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
 	printf("       YUV decoding in libjpeg decompressor\n\n");
 	printf("       [-quiet]\n");
-	printf("       Output in tabular rather than verbose format\n\n");
+	printf("       Output in tabular rather than verbose format\n");
+	printf("       [-yuv]\n");
+	printf("       Encode RGB input as planar YUV rather than compressing as JPEG\n\n");
 	printf("       NOTE: If the quality is specified as a range, i.e. 90-100, a separate\n");
 	printf("       test will be performed for all quality values in the range.\n");
 	exit(1);
@@ -441,6 +455,11 @@
 				printf("Using fast upsampling code\n");
 				fastupsample=1;
 			}
+			if(!stricmp(argv[i], "-yuv"))
+			{
+				printf("Testing YUV planar encoding\n");
+				yuv=1;
+			}
 			if(!stricmp(argv[i], "-rgb")) pf=BMP_RGB;
 			if(!stricmp(argv[i], "-rgba")) pf=BMP_RGBA;
 			if(!stricmp(argv[i], "-bgr")) pf=BMP_BGR;
diff --git a/turbojpeg.h b/turbojpeg.h
index e382e71..b4bbd2d 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -53,6 +53,15 @@
 #define TJ_FASTUPSAMPLE  256
   /* Use fast, inaccurate 4:2:2 and 4:2:0 YUV upsampling routines
      (libjpeg version only) */
+#define TJ_YUV           512
+  /* Use the TurboJPEG YUV encoder to produce a planar YUV image that is
+     suitable for X Video.  Specifically, if either the width or the height is
+     subsampled, then that dimension is padded to a multiple of 2 in the
+     output image.  Also, each line of each plane in the output image is
+     padded to a multiple of 4 bytes.  Although this will work with any
+     subsampling option, it is really only useful in combination with TJ_420,
+     which produces an image compatible with the I420 format. */
+
 typedef void* tjhandle;
 
 #define TJPAD(p) (((p)+3)&(~3))
diff --git a/turbojpegl.c b/turbojpegl.c
index 03833c5..1d719cb 100644
--- a/turbojpegl.c
+++ b/turbojpegl.c
@@ -18,10 +18,35 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#define JPEG_INTERNALS
 #include <jpeglib.h>
 #include <jerror.h>
 #include <setjmp.h>
 #include "./turbojpeg.h"
+#ifdef sun
+#include <malloc.h>
+#endif
+
+/* Allocate size bytes aligned to boundary where the platform offers an aligned
+ * allocator (plain malloc() on Windows/macOS).  Returns NULL on failure. */
+void *__memalign(size_t boundary, size_t size)
+{
+	#if defined(_WIN32) || defined(__APPLE__)
+	return malloc(size);
+	#elif defined(sun)
+	return memalign(boundary, size);
+	#else
+	void *ptr=NULL;
+	/* Don't trust ptr after a failed call; older POSIX left it unspecified. */
+	return posix_memalign(&ptr, boundary, size)==0? ptr:NULL;
+	#endif
+}
+
+#ifndef min
+ #define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
 
 
 // Error handling
@@ -117,9 +142,18 @@
 	int jpegsub, int qual, int flags)
 {
 	int i;  JSAMPROW *row_pointer=NULL;
+	JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
+	JSAMPROW *tmpbuf[MAX_COMPONENTS], *tmpbuf2[MAX_COMPONENTS];
+	JSAMPROW *outbuf[MAX_COMPONENTS];
 
 	checkhandle(h);
 
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		tmpbuf[i]=NULL;  _tmpbuf[i]=NULL;
+		tmpbuf2[i]=NULL;  _tmpbuf2[i]=NULL;  outbuf[i]=NULL;
+	}
+
 	if(srcbuf==NULL || width<=0 || pitch<0 || height<=0
 		|| dstbuf==NULL || size==NULL
 		|| jpegsub<0 || jpegsub>=NUMSUBOPT || qual<0 || qual>100)
@@ -158,6 +192,14 @@
 	if(setjmp(j->jerr.jb))
 	{  // this will execute if LIBJPEG has an error
 		if(row_pointer) free(row_pointer);
+		for(i=0; i<MAX_COMPONENTS; i++)
+		{
+			if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
+			if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
+			if(tmpbuf2[i]!=NULL) free(tmpbuf2[i]);
+			if(_tmpbuf2[i]!=NULL) free(_tmpbuf2[i]);
+			if(outbuf[i]!=NULL) free(outbuf[i]);
+		}
 		return -1;
 	}
 
@@ -180,24 +222,105 @@
 	j->jdms.next_output_byte = dstbuf;
 	j->jdms.free_in_buffer = TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height);
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-		_throw("Memory allocation failed in tjInitCompress()");
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJ_BOTTOMUP) row_pointer[i]= &srcbuf[(height-i-1)*pitch];
-		else row_pointer[i]= &srcbuf[i*pitch];
-	}
 	jpeg_start_compress(&j->cinfo, TRUE);
-	while(j->cinfo.next_scanline<j->cinfo.image_height)
+	if(flags&TJ_YUV)
 	{
-		jpeg_write_scanlines(&j->cinfo, &row_pointer[j->cinfo.next_scanline],
-			j->cinfo.image_height-j->cinfo.next_scanline);
+		j_compress_ptr cinfo=&j->cinfo;
+		int row;
+		int pw=PAD(width, cinfo->max_h_samp_factor);
+		int ph=PAD(height, cinfo->max_v_samp_factor);
+		int cw[MAX_COMPONENTS], ch[MAX_COMPONENTS];
+		jpeg_component_info *compptr;
+		JSAMPLE *ptr=dstbuf;  unsigned long yuvsize=0;
+
+		if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph))==NULL)
+			_throw("Memory allocation failed in tjCompress()");
+		for(i=0; i<height; i++)
+		{
+			if(flags&TJ_BOTTOMUP) row_pointer[i]= &srcbuf[(height-i-1)*pitch];
+			else row_pointer[i]= &srcbuf[i*pitch];
+		}
+		if(height<ph)
+			for(i=height; i<ph; i++) row_pointer[i]=row_pointer[height-1];
+
+		for(i=0; i<cinfo->num_components; i++)
+		{
+			compptr=&cinfo->comp_info[i];
+			_tmpbuf[i]=(JSAMPLE *)__memalign(16,
+				PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
+					/compptr->h_samp_factor, 16) * cinfo->max_v_samp_factor);
+			if(!_tmpbuf[i]) _throw("Memory allocation failure");
+			tmpbuf[i]=(JSAMPROW *)__memalign(16,
+				sizeof(JSAMPROW)*cinfo->max_v_samp_factor);
+			if(!tmpbuf[i]) _throw("Memory allocation failure");
+			for(row=0; row<cinfo->max_v_samp_factor; row++)
+				tmpbuf[i][row]=&_tmpbuf[i][
+					PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
+						/compptr->h_samp_factor, 16) * row];
+			_tmpbuf2[i]=(JSAMPLE *)__memalign(16,
+				PAD(compptr->width_in_blocks*DCTSIZE, 16) * compptr->v_samp_factor);
+			if(!_tmpbuf2[i]) _throw("Memory allocation failure");
+			tmpbuf2[i]=(JSAMPROW *)__memalign(16,
+				sizeof(JSAMPROW)*compptr->v_samp_factor);
+			if(!tmpbuf2[i]) _throw("Memory allocation failure");
+			for(row=0; row<compptr->v_samp_factor; row++)
+				tmpbuf2[i][row]=&_tmpbuf2[i][
+					PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
+			cw[i]=pw*compptr->h_samp_factor/cinfo->max_h_samp_factor;
+			ch[i]=ph*compptr->v_samp_factor/cinfo->max_v_samp_factor;
+			outbuf[i]=(JSAMPROW *)__memalign(16, sizeof(JSAMPROW)*ch[i]);
+			if(!outbuf[i]) _throw("Memory allocation failure");
+			for(row=0; row<ch[i]; row++)
+			{
+				outbuf[i][row]=ptr;
+				ptr+=PAD(cw[i], 4);
+			}
+		}
+		yuvsize=(unsigned long)(ptr-dstbuf);
+
+		for(row=0; row<ph; row+=cinfo->max_v_samp_factor)
+		{
+			(*cinfo->cconvert->color_convert)(cinfo, &row_pointer[row], tmpbuf,
+				0, cinfo->max_v_samp_factor);
+			(cinfo->downsample->downsample)(cinfo, tmpbuf, 0, tmpbuf2, 0);
+			for(i=0, compptr=cinfo->comp_info; i<cinfo->num_components;
+				i++, compptr++)
+				jcopy_sample_rows(tmpbuf2[i], 0, outbuf[i],
+					row*compptr->v_samp_factor/cinfo->max_v_samp_factor,
+					compptr->v_samp_factor, cw[i]);
+		}
+		*size=yuvsize;
+		cinfo->next_scanline+=height;
+	}
+	else
+	{
+		if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
+			_throw("Memory allocation failed in tjCompress()");
+		for(i=0; i<height; i++)
+		{
+			if(flags&TJ_BOTTOMUP) row_pointer[i]= &srcbuf[(height-i-1)*pitch];
+			else row_pointer[i]= &srcbuf[i*pitch];
+		}
+		while(j->cinfo.next_scanline<j->cinfo.image_height)
+		{
+			jpeg_write_scanlines(&j->cinfo, &row_pointer[j->cinfo.next_scanline],
+				j->cinfo.image_height-j->cinfo.next_scanline);
+		}
 	}
 	jpeg_finish_compress(&j->cinfo);
-	*size=TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height)
-		-(unsigned long)(j->jdms.free_in_buffer);
+	if(!(flags&TJ_YUV))
+		*size=TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height)
+			-(unsigned long)(j->jdms.free_in_buffer);
 
 	if(row_pointer) free(row_pointer);
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
+		if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
+		if(tmpbuf2[i]!=NULL) free(tmpbuf2[i]);
+		if(_tmpbuf2[i]!=NULL) free(_tmpbuf2[i]);
+		if(outbuf[i]!=NULL) free(outbuf[i]);
+	}
 	return 0;
 }
 
@@ -334,6 +457,7 @@
 	#else
 	#error "TurboJPEG requires JPEG colorspace extensions"
 	#endif
+
 	if(flags&TJ_FASTUPSAMPLE) j->dinfo.do_fancy_upsampling=FALSE;
 
 	jpeg_start_decompress(&j->dinfo);