In the Windows installer packages, place a duplicate copy of turbojpeg.dll in c:\libjpeg-turbo[-gcc][64]\bin.  This is mainly to give installers an easy way to find the DLL for the purposes of bundling it.  Specifically, this was necessary for TurboVNC, becuase 32-bit CMake running on 64-bit Windows cannot ever access the "real" c:\windows\system32 directory.


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1027 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1169391..495ac2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 cmake_minimum_required(VERSION 2.6)
 
 project(libjpeg-turbo C)
-set(VERSION 1.3.1)
+set(VERSION 1.3.80)
 
 if(MINGW OR CYGWIN)
   execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
@@ -314,12 +314,15 @@
 if(WITH_JAVA)
 add_test(TJUnitTest ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest)
 add_test(TJUnitTest-yuv ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -yuv)
+add_test(TJUnitTest-yuv-nopad ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -yuv -noyuvpad)
 add_test(TJUnitTest-bi ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -bi)
 add_test(TJUnitTest-bi-yuv ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -bi -yuv)
+add_test(TJUnitTest-bi-yuv-nopad ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -bi -yuv -noyuvpad)
 endif()
 add_test(tjunittest tjunittest)
 add_test(tjunittest-alloc tjunittest -alloc)
 add_test(tjunittest-yuv tjunittest -yuv)
+add_test(tjunittest-yuv-nopad tjunittest -yuv -noyuvpad)
 add_test(cjpeg-int sharedlib/cjpeg -dct int -outfile testoutint.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
 add_test(cjpeg-int-cmp ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_INT} -DFILE=testoutint.jpg -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
 add_test(cjpeg-fast sharedlib/cjpeg -dct fast -opt -outfile testoutfst.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
@@ -375,6 +378,7 @@
 add_test(tjunittest-static tjunittest-static)
 add_test(tjunittest-static-alloc tjunittest-static -alloc)
 add_test(tjunittest-static-yuv tjunittest-static -yuv)
+add_test(tjunittest-static-yuv-nopad tjunittest-static -yuv -noyuvpad)
 add_test(cjpeg-static-int cjpeg-static -dct int -outfile testoutint.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
 add_test(cjpeg-static-int-cmp ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_INT} -DFILE=testoutint.jpg -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
 add_test(cjpeg-static-fast cjpeg-static -dct fast -opt -outfile testoutfst.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 68552bf..f10aa4d 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,3 +1,26 @@
+1.4 pre-beta
+============
+
+[1] The TurboJPEG API can now be used to generate YUV images with an arbitrary
+line padding (previously, it only supported 4-byte padding, which was
+compatible with X Video.)  Also, the decompress-to-YUV function has been
+extended to support image scaling.
+
+[2] Added SIMD acceleration for performing color conversion, downsampling,
+and upsampling on DSPr2-capable MIPS platforms.  This speeds up the compression
+of full-color JPEGs by 6-18% on such platforms and decompression by 3-12%.
+
+[3] Added support for 4:1:1 subsampling to the TurboJPEG API.  This is mainly
+included for compatibility, since 4:1:1 is not fully accelerated in
+libjpeg-turbo and has no significant advantages relative to 4:2:0.
+
+[4] Added support for CMYK images to the TurboJPEG API.  This feature allows
+CMYK source images to be compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be
+decompressed to CMYK destination images.  Conversion between CMYK and RGB
+images is not supported.  Such conversion requires a color management system
+and is out of scope for a codec library.
+
+
 1.3.1
 =====
 
diff --git a/Makefile.am b/Makefile.am
index ade5034..6096049 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -4,7 +4,7 @@
 
 if WITH_TURBOJPEG
 lib_LTLIBRARIES += libturbojpeg.la
-libturbojpeg_la_LDFLAGS = -version-info 0:0 -no-undefined
+libturbojpeg_la_LDFLAGS = -version-info 1:0:1 -no-undefined
 include_HEADERS += turbojpeg.h
 endif
 
@@ -200,11 +200,14 @@
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -bi
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -noyuvpad
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi -noyuvpad
 endif
 	./tjunittest
 	./tjunittest -alloc
 	./tjunittest -yuv
+	./tjunittest -yuv -noyuvpad
 endif
 	./cjpeg -dct int -outfile testoutint.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_INT) testoutint.jpg
@@ -311,6 +314,11 @@
 	rm -f *_440_*.ppm
 	rm -f *_440_*.jpg
 	rm -f *_440.yuv
+	rm -f *_411_*.bmp
+	rm -f *_411_*.png
+	rm -f *_411_*.ppm
+	rm -f *_411_*.jpg
+	rm -f *_411.yuv
 
 
 tjtest:
diff --git a/acinclude.m4 b/acinclude.m4
index ea8ec52..8031136 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -180,3 +180,33 @@
     $2
   fi
 ])
+
+# AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE
+# --------------------------
+# Test whether the assembler is suitable and supports MIPS instructions
+AC_DEFUN([AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE],[
+  have_mips_dspr2=no
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="$CCASFLAGS -mdspr2"
+
+  AC_COMPILE_IFELSE([[
+
+  int main ()
+  {
+    int c = 0, a = 0, b = 0;
+    __asm__ __volatile__ (
+        "precr.qb.ph %[c], %[a], %[b]          \n\t"
+        : [c] "=r" (c)
+        : [a] "r" (a), [b] "r" (b)
+    );
+    return c;
+  }
+  ]], have_mips_dspr2=yes)
+  CFLAGS=$ac_save_CFLAGS
+
+  if test "x$have_mips_dspr2" = "xyes" ; then
+    $1
+  else
+    $2
+  fi
+])
diff --git a/configure.ac b/configure.ac
index e7435aa..0a51c1a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([libjpeg-turbo], [1.3.1])
+AC_INIT([libjpeg-turbo], [1.3.80])
 BUILD=`date +%Y%m%d`
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
@@ -423,6 +423,16 @@
          with_simd=no
          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])])
       ;;
+    mipsel*)
+      AC_MSG_RESULT([yes (mipsel)])
+      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
+      AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE(
+        [AC_MSG_RESULT([yes])
+         simd_arch=mipsel],
+        [AC_MSG_RESULT([no])
+         with_simd=no
+         AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])])
+      ;;
     *)
       AC_MSG_RESULT([no ("$host_cpu")])
       AC_MSG_WARN([SIMD support not available for this CPU.  Performance will suffer.])
@@ -442,6 +452,7 @@
 AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
 AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
 AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
+AM_CONDITIONAL([SIMD_MIPSEL], [test "x$simd_arch" = "xmipsel"])
 AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
 AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
 
diff --git a/doc/html/annotated.html b/doc/html/annotated.html
index 1e3fbd0..f928720 100644
--- a/doc/html/annotated.html
+++ b/doc/html/annotated.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/classes.html b/doc/html/classes.html
index 4722b14..ad625f1 100644
--- a/doc/html/classes.html
+++ b/doc/html/classes.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions.html b/doc/html/functions.html
index 7af0d8e..55ccba0 100644
--- a/doc/html/functions.html
+++ b/doc/html/functions.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions_vars.html b/doc/html/functions_vars.html
index e6a6f72..cdc5560 100644
--- a/doc/html/functions_vars.html
+++ b/doc/html/functions_vars.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/group___turbo_j_p_e_g.html b/doc/html/group___turbo_j_p_e_g.html
index 7a72821..c1df193 100644
--- a/doc/html/group___turbo_j_p_e_g.html
+++ b/doc/html/group___turbo_j_p_e_g.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
@@ -109,6 +109,9 @@
 <tr class="memitem:ga7010a4402f54a45ba822ad8675a4655e"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e">TJ_NUMPF</a></td></tr>
 <tr class="memdesc:ga7010a4402f54a45ba822ad8675a4655e"><td class="mdescLeft">&#160;</td><td class="mdescRight">The number of pixel formats.  <a href="#ga7010a4402f54a45ba822ad8675a4655e">More...</a><br/></td></tr>
 <tr class="separator:ga7010a4402f54a45ba822ad8675a4655e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga39f57a6fb02d9cf32e7b6890099b5a71"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga39f57a6fb02d9cf32e7b6890099b5a71">TJ_NUMCS</a></td></tr>
+<tr class="memdesc:ga39f57a6fb02d9cf32e7b6890099b5a71"><td class="mdescLeft">&#160;</td><td class="mdescRight">The number of JPEG colorspaces.  <a href="#ga39f57a6fb02d9cf32e7b6890099b5a71">More...</a><br/></td></tr>
+<tr class="separator:ga39f57a6fb02d9cf32e7b6890099b5a71"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga72ecf4ebe6eb702d3c6f5ca27455e1ec"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">TJFLAG_BOTTOMUP</a></td></tr>
 <tr class="memdesc:ga72ecf4ebe6eb702d3c6f5ca27455e1ec"><td class="mdescLeft">&#160;</td><td class="mdescRight">The uncompressed source/destination image is stored in bottom-up (Windows, OpenGL) order, not top-down (X11) order.  <a href="#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">More...</a><br/></td></tr>
 <tr class="separator:ga72ecf4ebe6eb702d3c6f5ca27455e1ec"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -178,7 +181,8 @@
 <a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737">TJSAMP_420</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a3f1c9504842ddc7a48d0f690754b6248">TJSAMP_GRAY</a>, 
 <br/>
-&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974">TJSAMP_440</a>
+&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974">TJSAMP_440</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2">TJSAMP_411</a>
 <br/>
  }</td></tr>
 <tr class="memdesc:ga1d047060ea80bb9820d540bb928e9074"><td class="mdescLeft">&#160;</td><td class="mdescRight">Chrominance subsampling options.  <a href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">More...</a><br/></td></tr>
@@ -196,11 +200,23 @@
 <br/>
 &#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aac037ff1845cf9b74bb81a3659c2b9fb4">TJPF_BGRA</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa1ba1a7f1631dbeaa49a0a85fc4a40081">TJPF_ABGR</a>, 
-<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c">TJPF_ARGB</a>
+<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c">TJPF_ARGB</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b">TJPF_CMYK</a>
 <br/>
  }</td></tr>
 <tr class="memdesc:gac916144e26c3817ac514e64ae5d12e2a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Pixel formats.  <a href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">More...</a><br/></td></tr>
 <tr class="separator:gac916144e26c3817ac514e64ae5d12e2a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">TJCS</a> { <br/>
+&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555">TJCS_RGB</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75">TJCS_YCbCr</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a">TJCS_GRAY</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53">TJCS_CMYK</a>, 
+<br/>
+&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e">TJCS_YCCK</a>
+<br/>
+ }</td></tr>
+<tr class="memdesc:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="mdescLeft">&#160;</td><td class="mdescRight">JPEG colorspaces.  <a href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">More...</a><br/></td></tr>
+<tr class="separator:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga2de531af4e7e6c4f124908376b354866"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866">TJXOP</a> { <br/>
 &#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aad88c0366cd3f7d0eac9d7a3fa1c2c27">TJXOP_NONE</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aa0df69776caa30f0fa28e26332d311ce">TJXOP_HFLIP</a>, 
@@ -222,32 +238,32 @@
 <tr class="memdesc:ga3d10c47fbe4a2489a2b30c931551d01a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a TurboJPEG compressor instance.  <a href="#ga3d10c47fbe4a2489a2b30c931551d01a">More...</a><br/></td></tr>
 <tr class="separator:ga3d10c47fbe4a2489a2b30c931551d01a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaba62b7a98f960839b588579898495cf2"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2">tjCompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)</td></tr>
-<tr class="memdesc:gaba62b7a98f960839b588579898495cf2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress an RGB or grayscale image into a JPEG image.  <a href="#gaba62b7a98f960839b588579898495cf2">More...</a><br/></td></tr>
+<tr class="memdesc:gaba62b7a98f960839b588579898495cf2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress an RGB, grayscale, or CMYK image into a JPEG image.  <a href="#gaba62b7a98f960839b588579898495cf2">More...</a><br/></td></tr>
 <tr class="separator:gaba62b7a98f960839b588579898495cf2"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned long DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b">tjBufSize</a> (int width, int height, int jpegSubsamp)</td></tr>
 <tr class="memdesc:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="mdescLeft">&#160;</td><td class="mdescRight">The maximum size of the buffer (in bytes) required to hold a JPEG image with the given parameters.  <a href="#gaccc5bca7f12fcdcc302e6e1c6d4b311b">More...</a><br/></td></tr>
 <tr class="separator:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga9d0cb06fd5052d21b6f2b382db8b219c"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned long DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c">tjBufSizeYUV</a> (int width, int height, int subsamp)</td></tr>
-<tr class="memdesc:ga9d0cb06fd5052d21b6f2b382db8b219c"><td class="mdescLeft">&#160;</td><td class="mdescRight">The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters.  <a href="#ga9d0cb06fd5052d21b6f2b382db8b219c">More...</a><br/></td></tr>
-<tr class="separator:ga9d0cb06fd5052d21b6f2b382db8b219c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga0fa4e7b1943687c6a0c0304529c55d35"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35">tjEncodeYUV2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf, int subsamp, int flags)</td></tr>
-<tr class="memdesc:ga0fa4e7b1943687c6a0c0304529c55d35"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into a YUV planar image.  <a href="#ga0fa4e7b1943687c6a0c0304529c55d35">More...</a><br/></td></tr>
-<tr class="separator:ga0fa4e7b1943687c6a0c0304529c55d35"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gaf451664a62c1f6c7cc5a6401f32908c9"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned long DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9">tjBufSizeYUV2</a> (int width, int pad, int height, int subsamp)</td></tr>
+<tr class="memdesc:gaf451664a62c1f6c7cc5a6401f32908c9"><td class="mdescLeft">&#160;</td><td class="mdescRight">The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters.  <a href="#gaf451664a62c1f6c7cc5a6401f32908c9">More...</a><br/></td></tr>
+<tr class="separator:gaf451664a62c1f6c7cc5a6401f32908c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360">tjEncodeYUV3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags)</td></tr>
+<tr class="memdesc:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into a YUV planar image.  <a href="#ga0a5ffbf7cb58a5b6a8201114fe889360">More...</a><br/></td></tr>
+<tr class="separator:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gae5408179d041e2a2f7199c8283cf649e"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gae5408179d041e2a2f7199c8283cf649e">tjInitDecompress</a> (void)</td></tr>
 <tr class="memdesc:gae5408179d041e2a2f7199c8283cf649e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a TurboJPEG decompressor instance.  <a href="#gae5408179d041e2a2f7199c8283cf649e">More...</a><br/></td></tr>
 <tr class="separator:gae5408179d041e2a2f7199c8283cf649e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gac5675fceb7997b385516cdffdb34e6aa"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gac5675fceb7997b385516cdffdb34e6aa">tjDecompressHeader2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height, int *jpegSubsamp)</td></tr>
-<tr class="memdesc:gac5675fceb7997b385516cdffdb34e6aa"><td class="mdescLeft">&#160;</td><td class="mdescRight">Retrieve information about a JPEG image without decompressing it.  <a href="#gac5675fceb7997b385516cdffdb34e6aa">More...</a><br/></td></tr>
-<tr class="separator:gac5675fceb7997b385516cdffdb34e6aa"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gacd0fac3af74b3511d39b4781b7103086"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086">tjDecompressHeader3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height, int *jpegSubsamp, int *jpegColorspace)</td></tr>
+<tr class="memdesc:gacd0fac3af74b3511d39b4781b7103086"><td class="mdescLeft">&#160;</td><td class="mdescRight">Retrieve information about a JPEG image without decompressing it.  <a href="#gacd0fac3af74b3511d39b4781b7103086">More...</a><br/></td></tr>
+<tr class="separator:gacd0fac3af74b3511d39b4781b7103086"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga6449044b9af402999ccf52f401333be8"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="structtjscalingfactor.html">tjscalingfactor</a> *DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8">tjGetScalingFactors</a> (int *numscalingfactors)</td></tr>
 <tr class="memdesc:ga6449044b9af402999ccf52f401333be8"><td class="mdescLeft">&#160;</td><td class="mdescRight">Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of TurboJPEG supports.  <a href="#ga6449044b9af402999ccf52f401333be8">More...</a><br/></td></tr>
 <tr class="separator:ga6449044b9af402999ccf52f401333be8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gada69cc6443d1bb493b40f1626259e5e9"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9">tjDecompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
-<tr class="memdesc:gada69cc6443d1bb493b40f1626259e5e9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to an RGB or grayscale image.  <a href="#gada69cc6443d1bb493b40f1626259e5e9">More...</a><br/></td></tr>
+<tr class="memdesc:gada69cc6443d1bb493b40f1626259e5e9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to an RGB, grayscale, or CMYK image.  <a href="#gada69cc6443d1bb493b40f1626259e5e9">More...</a><br/></td></tr>
 <tr class="separator:gada69cc6443d1bb493b40f1626259e5e9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gad7810af095624a4016e72957a50f77d8"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gad7810af095624a4016e72957a50f77d8">tjDecompressToYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int flags)</td></tr>
-<tr class="memdesc:gad7810af095624a4016e72957a50f77d8"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to a YUV planar image.  <a href="#gad7810af095624a4016e72957a50f77d8">More...</a><br/></td></tr>
-<tr class="separator:gad7810af095624a4016e72957a50f77d8"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07">tjDecompressToYUV2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pad, int height, int flags)</td></tr>
+<tr class="memdesc:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to a YUV planar image.  <a href="#ga7c08b340ad7f8e85d407bd9e81d44d07">More...</a><br/></td></tr>
+<tr class="separator:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga3155b775bfbac9dbba869b95a0367902"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga3155b775bfbac9dbba869b95a0367902">tjInitTransform</a> (void)</td></tr>
 <tr class="memdesc:ga3155b775bfbac9dbba869b95a0367902"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a new TurboJPEG transformer instance.  <a href="#ga3155b775bfbac9dbba869b95a0367902">More...</a><br/></td></tr>
 <tr class="separator:ga3155b775bfbac9dbba869b95a0367902"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -292,6 +308,20 @@
 <p>TurboJPEG API. </p>
 <p>This API provides an interface for generating, decoding, and transforming planar YUV and JPEG images in memory. </p>
 <h2 class="groupheader">Macro Definition Documentation</h2>
+<a class="anchor" id="ga39f57a6fb02d9cf32e7b6890099b5a71"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define TJ_NUMCS</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>The number of JPEG colorspaces. </p>
+
+</div>
+</div>
 <a class="anchor" id="ga7010a4402f54a45ba822ad8675a4655e"></a>
 <div class="memitem">
 <div class="memproto">
@@ -460,7 +490,7 @@
 </div><div class="memdoc">
 
 <p>Disable buffer (re)allocation. </p>
-<p>If passed to <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB or grayscale image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>, this flag will cause those functions to generate an error if the JPEG image buffer is invalid or too small rather than attempting to allocate or reallocate that buffer. This reproduces the behavior of earlier versions of TurboJPEG. </p>
+<p>If passed to <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>, this flag will cause those functions to generate an error if the JPEG image buffer is invalid or too small rather than attempting to allocate or reallocate that buffer. This reproduces the behavior of earlier versions of TurboJPEG. </p>
 
 </div>
 </div>
@@ -613,6 +643,42 @@
 </div>
 </div>
 <h2 class="groupheader">Enumeration Type Documentation</h2>
+<a class="anchor" id="ga4f83ad3368e0e29d1957be0efa7c3720"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">enum <a class="el" href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">TJCS</a></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>JPEG colorspaces. </p>
+<table class="fieldtable">
+<tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555"></a>TJCS_RGB</em>&nbsp;</td><td class="fielddoc">
+<p>RGB colorspace. </p>
+<p>When compressing the JPEG image, the R, G, and B components in the source image are reordered into image planes, but no colorspace conversion or subsampling is performed. RGB JPEG images can be decompressed to any of the extended RGB pixel formats or grayscale, but they cannot be decompressed to YUV images. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75"></a>TJCS_YCbCr</em>&nbsp;</td><td class="fielddoc">
+<p>YCbCr colorspace. </p>
+<p>YCbCr is not an absolute colorspace but rather a mathematical transformation of RGB designed solely for storage and transmission. YCbCr images must be converted to RGB before they can actually be displayed. In the YCbCr colorspace, the Y (luminance) component represents the black &amp; white portion of the original image, and the Cb and Cr (chrominance) components represent the color portion of the original image. Originally, the analog equivalent of this transformation allowed the same signal to drive both black &amp; white and color televisions, but JPEG images use YCbCr primarily because it allows the color data to be optionally subsampled for the purposes of reducing bandwidth or disk space. YCbCr is the most common JPEG colorspace, and YCbCr JPEG images can be compressed from and decompressed to any of the extended RGB pixel formats or grayscale, or they can be decompressed to YUV planar images. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a"></a>TJCS_GRAY</em>&nbsp;</td><td class="fielddoc">
+<p>Grayscale colorspace. </p>
+<p>The JPEG image retains only the luminance data (Y component), and any color data from the source image is discarded. Grayscale JPEG images can be compressed from and decompressed to any of the extended RGB pixel formats or grayscale, or they can be decompressed to YUV planar images. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53"></a>TJCS_CMYK</em>&nbsp;</td><td class="fielddoc">
+<p>CMYK colorspace. </p>
+<p>When compressing the JPEG image, the C, M, Y, and K components in the source image are reordered into image planes, but no colorspace conversion or subsampling is performed. CMYK JPEG images can only be decompressed to CMYK pixels. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e"></a>TJCS_YCCK</em>&nbsp;</td><td class="fielddoc">
+<p>YCCK colorspace. </p>
+<p>YCCK (AKA "YCbCrK") is not an absolute colorspace but rather a mathematical transformation of CMYK designed solely for storage and transmission. It is to CMYK as YCbCr is to RGB. CMYK pixels can be reversibly transformed into YCCK, and as with YCbCr, the chrominance components in the YCCK pixels can be subsampled without incurring major perceptual loss. YCCK JPEG images can only be compressed from and decompressed to CMYK pixels. </p>
+</td></tr>
+</table>
+
+</div>
+</div>
 <a class="anchor" id="gac916144e26c3817ac514e64ae5d12e2a"></a>
 <div class="memitem">
 <div class="memproto">
@@ -669,6 +735,10 @@
 <p>ARGB pixel format. </p>
 <p>This is the same as <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aadae996905efcfa3b42a0bb3bea7f9d84">TJPF_XRGB</a>, except that when decompressing, the X component is guaranteed to be 0xFF, which can be interpreted as an opaque alpha channel. </p>
 </td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b"></a>TJPF_CMYK</em>&nbsp;</td><td class="fielddoc">
+<p>CMYK pixel format. </p>
+<p>Unlike RGB, which is an additive color model used primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive color model used primarily for printing. In the CMYK color model, the value of each color component typically corresponds to an amount of cyan, magenta, yellow, or black ink that is applied to a white background. In order to convert between CMYK and RGB, it is necessary to use a color management system (CMS.) A CMS will attempt to map colors within the printer's gamut to perceptually similar colors in the display's gamut and vice versa, but the mapping is typically not 1:1 or reversible, nor can it be defined with a simple formula. Thus, such a conversion is out of scope for a codec library. However, the TurboJPEG API allows for compressing CMYK pixels into a YCCK JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e" title="YCCK colorspace.">TJCS_YCCK</a>) and decompressing YCCK JPEG images into CMYK pixels. </p>
+</td></tr>
 </table>
 
 </div>
@@ -684,7 +754,8 @@
 </div><div class="memdoc">
 
 <p>Chrominance subsampling options. </p>
-<p>When an image is converted from the RGB to the YUV colorspace as part of the JPEG compression process, some of the U and V (chrominance) components can be discarded or averaged together to produce a smaller image with little perceptible loss of image clarity (the human eye is more sensitive to small changes in brightness than small changes in color.) This is called "chrominance subsampling". </p>
+<p>When pixels are converted from RGB to YCbCr (see <a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75" title="YCbCr colorspace.">TJCS_YCbCr</a>) or from CMYK to YCCK (see <a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e" title="YCCK colorspace.">TJCS_YCCK</a>) as part of the JPEG compression process, some of the Cb and Cr (chrominance) components can be discarded or averaged together to produce a smaller image with little perceptible loss of image clarity (the human eye is more sensitive to small changes in brightness than to small changes in color.) This is called "chrominance subsampling". </p>
+<p>NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the convention of the digital video community, the TurboJPEG API uses "YUV" to refer to an image format consisting of Y, Cb, and Cr image planes. </p>
 <table class="fieldtable">
 <tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><em><a class="anchor" id="gga1d047060ea80bb9820d540bb928e9074afb8da4f44197837bdec0a4f593dacae3"></a>TJSAMP_444</em>&nbsp;</td><td class="fielddoc">
 <p>4:4:4 chrominance subsampling (no chrominance subsampling). </p>
@@ -706,6 +777,10 @@
 <p>4:4:0 chrominance subsampling. </p>
 <p>The JPEG or YUV image will contain one chrominance component for every 1x2 block of pixels in the source image. Note that 4:4:0 subsampling is not fully accelerated in libjpeg-turbo. </p>
 </td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2"></a>TJSAMP_411</em>&nbsp;</td><td class="fielddoc">
+<p>4:1:1 chrominance subsampling. </p>
+<p>The JPEG or YUV image will contain one chrominance component for every 4x1 block of pixels in the source image. JPEG images compressed with 4:1:1 subsampling will be almost exactly the same size as those compressed with 4:2:0 subsampling, and in the aggregate, both subsampling methods produce approximately the same perceptual quality. However, 4:1:1 is better able to reproduce sharp horizontal features. Note that 4:1:1 subsampling is not fully accelerated in libjpeg-turbo. </p>
+</td></tr>
 </table>
 
 </div>
@@ -771,7 +846,7 @@
 </div><div class="memdoc">
 
 <p>Allocate an image buffer for use with TurboJPEG. </p>
-<p>You should always use this function to allocate the JPEG destination buffer(s) for <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB or grayscale image into a JPEG image.">tjCompress2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> unless you are disabling automatic buffer (re)allocation (by setting <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a>.)</p>
+<p>You should always use this function to allocate the JPEG destination buffer(s) for <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> unless you are disabling automatic buffer (re)allocation (by setting <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a>.)</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">bytes</td><td>the number of bytes to allocate</td></tr>
@@ -827,12 +902,12 @@
 
 </div>
 </div>
-<a class="anchor" id="ga9d0cb06fd5052d21b6f2b382db8b219c"></a>
+<a class="anchor" id="gaf451664a62c1f6c7cc5a6401f32908c9"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT unsigned long DLLCALL tjBufSizeYUV </td>
+          <td class="memname">DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2 </td>
           <td>(</td>
           <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>width</em>, </td>
@@ -841,6 +916,12 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>height</em>, </td>
         </tr>
         <tr>
@@ -861,6 +942,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">width</td><td>width of the image (in pixels) </td></tr>
+    <tr><td class="paramname">pad</td><td>the width of each line in each plane of the image is padded to the nearest multiple of this number of bytes (must be a power of 2.) </td></tr>
     <tr><td class="paramname">height</td><td>height of the image (in pixels) </td></tr>
     <tr><td class="paramname">subsamp</td><td>level of chrominance subsampling in the image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.)</td></tr>
   </table>
@@ -948,11 +1030,11 @@
       </table>
 </div><div class="memdoc">
 
-<p>Compress an RGB or grayscale image into a JPEG image. </p>
+<p>Compress an RGB, grayscale, or CMYK image into a JPEG image. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance </td></tr>
-    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB or grayscale pixels to be compressed </td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB, grayscale, or CMYK pixels to be compressed </td></tr>
     <tr><td class="paramname">width</td><td>width (in pixels) of the source image </td></tr>
     <tr><td class="paramname">pitch</td><td>bytes per line of the source image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use this parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>. </td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image </td></tr>
@@ -1040,7 +1122,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>Decompress a JPEG image to an RGB or grayscale image. </p>
+<p>Decompress a JPEG image to an RGB, grayscale, or CMYK image. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance </td></tr>
@@ -1048,26 +1130,23 @@
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes) </td></tr>
     <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the decompressed image. This buffer should normally be <code>pitch * scaledHeight</code> bytes in size, where <code>scaledHeight</code> can be determined by calling <a class="el" href="group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df" title="Compute the scaled value of dimension using the given scaling factor.">TJSCALED()</a> with the JPEG image height and one of the scaling factors returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8" title="Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of Tur...">tjGetScalingFactors()</a>. The <code>dstBuf</code> pointer may also be used to decompress into a specific region of a larger buffer. </td></tr>
     <tr><td class="paramname">width</td><td>desired width (in pixels) of the destination image. If this is different than the width of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired width. If <code>width</code> is set to 0, then only the height will be considered when determining the scaled image size. </td></tr>
-    <tr><td class="paramname">pitch</td><td>bytes per line of the destination image. Normally, this is <code>scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the decompressed image is unpadded, else <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the decompressed image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. (NOTE: <code>scaledWidth</code> can be determined by calling <a class="el" href="group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df" title="Compute the scaled value of dimension using the given scaling factor.">TJSCALED()</a> with the JPEG image width and one of the scaling factors returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8" title="Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of Tur...">tjGetScalingFactors()</a>.) You can also be clever and use the pitch parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>scaledWidth<ul>
-<li><a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]. </li>
-</ul>
-</code></td></tr>
-    <tr><td class="paramname">height</td><td><code>desired height (in pixels) of the destination image. If this is different than the height of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired height. If <code>height</code> is set to 0, then only the width will be considered when determining the scaled image size. </code></td></tr>
-    <tr><td class="paramname">pixelFormat</td><td><code>pixel format of the destination image (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) </code></td></tr>
-    <tr><td class="paramname">flags</td><td><code>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</code></td></tr>
+    <tr><td class="paramname">pitch</td><td>bytes per line of the destination image. Normally, this is <code>scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the decompressed image is unpadded, else <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the decompressed image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. (NOTE: <code>scaledWidth</code> can be determined by calling <a class="el" href="group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df" title="Compute the scaled value of dimension using the given scaling factor.">TJSCALED()</a> with the JPEG image width and one of the scaling factors returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8" title="Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of Tur...">tjGetScalingFactors()</a>.) You can also be clever and use the pitch parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>. </td></tr>
+    <tr><td class="paramname">height</td><td>desired height (in pixels) of the destination image. If this is different than the height of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired height. If <code>height</code> is set to 0, then only the width will be considered when determining the scaled image size. </td></tr>
+    <tr><td class="paramname">pixelFormat</td><td>pixel format of the destination image (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) </td></tr>
+    <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd><code> 0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </code></dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
 
 </div>
 </div>
-<a class="anchor" id="gac5675fceb7997b385516cdffdb34e6aa"></a>
+<a class="anchor" id="gacd0fac3af74b3511d39b4781b7103086"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT int DLLCALL tjDecompressHeader2 </td>
+          <td class="memname">DLLEXPORT int DLLCALL tjDecompressHeader3 </td>
           <td>(</td>
           <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
           <td class="paramname"><em>handle</em>, </td>
@@ -1100,7 +1179,13 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int *&#160;</td>
-          <td class="paramname"><em>jpegSubsamp</em>&#160;</td>
+          <td class="paramname"><em>jpegSubsamp</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int *&#160;</td>
+          <td class="paramname"><em>jpegColorspace</em>&#160;</td>
         </tr>
         <tr>
           <td></td>
@@ -1118,7 +1203,8 @@
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes) </td></tr>
     <tr><td class="paramname">width</td><td>pointer to an integer variable that will receive the width (in pixels) of the JPEG image </td></tr>
     <tr><td class="paramname">height</td><td>pointer to an integer variable that will receive the height (in pixels) of the JPEG image </td></tr>
-    <tr><td class="paramname">jpegSubsamp</td><td>pointer to an integer variable that will receive the level of chrominance subsampling used when compressing the JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.)</td></tr>
+    <tr><td class="paramname">jpegSubsamp</td><td>pointer to an integer variable that will receive the level of chrominance subsampling used when compressing the JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) </td></tr>
+    <tr><td class="paramname">jpegColorspace</td><td>pointer to an integer variable that will receive one of the JPEG colorspace constants, indicating the colorspace of the JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">JPEG colorspaces</a>.)</td></tr>
   </table>
   </dd>
 </dl>
@@ -1126,12 +1212,12 @@
 
 </div>
 </div>
-<a class="anchor" id="gad7810af095624a4016e72957a50f77d8"></a>
+<a class="anchor" id="ga7c08b340ad7f8e85d407bd9e81d44d07"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT int DLLCALL tjDecompressToYUV </td>
+          <td class="memname">DLLEXPORT int DLLCALL tjDecompressToYUV2 </td>
           <td>(</td>
           <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
           <td class="paramname"><em>handle</em>, </td>
@@ -1158,6 +1244,24 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>width</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>height</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>flags</em>&#160;</td>
         </tr>
         <tr>
@@ -1169,13 +1273,17 @@
 </div><div class="memdoc">
 
 <p>Decompress a JPEG image to a YUV planar image. </p>
-<p>This function performs JPEG decompression but leaves out the color conversion step, so a planar YUV image is generated instead of an RGB image. The padding of the planes in this image is the same as in the images generated by <a class="el" href="group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35" title="Encode an RGB or grayscale image into a YUV planar image.">tjEncodeYUV2()</a>. Note that, if the width or height of the image is not an even multiple of the MCU block size (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a> and <a class="el" href="group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf" title="MCU block height (in pixels) for a given level of chrominance subsampling.">tjMCUHeight</a>), then an intermediate buffer copy will be performed within TurboJPEG.</p>
+<p>This function performs JPEG decompression but leaves out the color conversion step, so a planar YUV image is generated instead of an RGB image. The structure of the planes in this image is the same as in the images generated by <a class="el" href="group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360" title="Encode an RGB or grayscale image into a YUV planar image.">tjEncodeYUV3()</a>. Note that, if the width or height of the JPEG image is not an even multiple of the MCU block size (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a> and <a class="el" href="group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf" title="MCU block height (in pixels) for a given level of chrominance subsampling.">tjMCUHeight</a>), then an intermediate buffer copy will be performed within TurboJPEG. </p>
+<p>NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the convention of the digital video community, the TurboJPEG API uses "YUV" to refer to an image format consisting of Y, Cb, and Cr image planes.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance </td></tr>
     <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress </td></tr>
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes) </td></tr>
-    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV()</a> to determine the appropriate size for this buffer based on the image width, height, and level of subsampling. </td></tr>
+    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> to determine the appropriate size for this buffer based on the image width, height, padding, and level of subsampling. </td></tr>
+    <tr><td class="paramname">width</td><td>desired width (in pixels) of the YUV image. If this is different than the width of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired width. If <code>width</code> is set to 0, then only the height will be considered when determining the scaled image size. </td></tr>
+    <tr><td class="paramname">pad</td><td>the width of each line in each plane of the YUV image will be padded to the nearest multiple of this number of bytes (must be a power of 2.) To generate images suitable for X Video, <code>pad</code> should be set to 4. </td></tr>
+    <tr><td class="paramname">height</td><td>desired height (in pixels) of the YUV image. If this is different than the height of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired height. If <code>height</code> is set to 0, then only the width will be considered when determining the scaled image size. </td></tr>
     <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
   </table>
   </dd>
@@ -1209,12 +1317,12 @@
 
 </div>
 </div>
-<a class="anchor" id="ga0fa4e7b1943687c6a0c0304529c55d35"></a>
+<a class="anchor" id="ga0a5ffbf7cb58a5b6a8201114fe889360"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT int DLLCALL tjEncodeYUV2 </td>
+          <td class="memname">DLLEXPORT int DLLCALL tjEncodeYUV3 </td>
           <td>(</td>
           <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
           <td class="paramname"><em>handle</em>, </td>
@@ -1259,6 +1367,12 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>subsamp</em>, </td>
         </tr>
         <tr>
@@ -1276,7 +1390,8 @@
 </div><div class="memdoc">
 
 <p>Encode an RGB or grayscale image into a YUV planar image. </p>
-<p>This function uses the accelerated color conversion routines in TurboJPEG's underlying codec to produce a planar YUV image that is suitable for X Video. Specifically, if the chrominance components are subsampled along the horizontal dimension, then the width of the luminance plane is padded to the nearest multiple of 2 in the output image (same goes for the height of the luminance plane, if the chrominance components are subsampled along the vertical dimension.) Also, each line of each plane in the output image is padded to 4 bytes. Although this will work with any subsampling option, it is really only useful in combination with TJ_420, which produces an image compatible with the I420 (AKA "YUV420P") format.</p>
+<p>This function uses the accelerated color conversion routines in TurboJPEG's underlying codec but does not execute any of the other steps in the JPEG compression process. The Y, U (Cb), and V (Cr) image planes are stored sequentially into the destination buffer, and the size of each plane is determined by the width and height of the source image, as well as the specified padding and level of chrominance subsampling. If the chrominance components are subsampled along the horizontal dimension, then the width of the luminance plane is padded to the nearest multiple of 2 in the output image (same goes for the height of the luminance plane, if the chrominance components are subsampled along the vertical dimension.) </p>
+<p>NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the convention of the digital video community, the TurboJPEG API uses "YUV" to refer to an image format consisting of Y, Cb, and Cr image planes.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance </td></tr>
@@ -1285,8 +1400,9 @@
     <tr><td class="paramname">pitch</td><td>bytes per line of the source image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use this parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>. </td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image </td></tr>
     <tr><td class="paramname">pixelFormat</td><td>pixel format of the source image (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) </td></tr>
-    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV()</a> to determine the appropriate size for this buffer based on the image width, height, and level of chrominance subsampling. </td></tr>
-    <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling to be used when generating the YUV image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) </td></tr>
+    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> to determine the appropriate size for this buffer based on the image width, height, padding, and level of chrominance subsampling. </td></tr>
+    <tr><td class="paramname">pad</td><td>the width of each line in each plane of the YUV image will be padded to the nearest multiple of this number of bytes (must be a power of 2.) To generate images suitable for X Video, <code>pad</code> should be set to 4. </td></tr>
+    <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling to be used when generating the YUV image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) To generate images suitable for X Video, <code>subsamp</code> should be set to <a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737">TJSAMP_420</a>. This produces an image compatible with the I420 (AKA "YUV420P") format. </td></tr>
     <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
   </table>
   </dd>
@@ -1310,7 +1426,7 @@
 </div><div class="memdoc">
 
 <p>Free an image buffer previously allocated by TurboJPEG. </p>
-<p>You should always use this function to free JPEG destination buffer(s) that were automatically (re)allocated by <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB or grayscale image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> or that were manually allocated using <a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a>.</p>
+<p>You should always use this function to free JPEG destination buffer(s) that were automatically (re)allocated by <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> or that were manually allocated using <a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a>.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">buffer</td><td>address of the buffer to free</td></tr>
@@ -1577,7 +1693,8 @@
 <li>8x8 for no subsampling or grayscale</li>
 <li>16x8 for 4:2:2</li>
 <li>8x16 for 4:4:0</li>
-<li>16x16 for 4:2:0 </li>
+<li>16x16 for 4:2:0</li>
+<li>32x8 for 4:1:1 </li>
 </ul>
 
 </div>
@@ -1606,7 +1723,8 @@
 <li>8x8 for no subsampling or grayscale</li>
 <li>16x8 for 4:2:2</li>
 <li>8x16 for 4:4:0</li>
-<li>16x16 for 4:2:0 </li>
+<li>16x16 for 4:2:0</li>
+<li>32x8 for 4:1:1 </li>
 </ul>
 
 </div>
diff --git a/doc/html/index.html b/doc/html/index.html
index 72daeb8..139b84c 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/modules.html b/doc/html/modules.html
index 25b7ec6..6b769e4 100644
--- a/doc/html/modules.html
+++ b/doc/html/modules.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/search/all_74.js b/doc/html/search/all_74.js
index a1927ba..4aec528 100644
--- a/doc/html/search/all_74.js
+++ b/doc/html/search/all_74.js
@@ -1,18 +1,25 @@
 var searchData=
 [
+  ['tj_5fnumcs',['TJ_NUMCS',['../group___turbo_j_p_e_g.html#ga39f57a6fb02d9cf32e7b6890099b5a71',1,'turbojpeg.h']]],
   ['tj_5fnumpf',['TJ_NUMPF',['../group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e',1,'turbojpeg.h']]],
   ['tj_5fnumsamp',['TJ_NUMSAMP',['../group___turbo_j_p_e_g.html#ga5ef3d169162ce77ce348e292a0b7477c',1,'turbojpeg.h']]],
   ['tj_5fnumxop',['TJ_NUMXOP',['../group___turbo_j_p_e_g.html#ga0f6dbd18adf38b7d46ac547f0f4d562c',1,'turbojpeg.h']]],
   ['tjalloc',['tjAlloc',['../group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff',1,'turbojpeg.h']]],
   ['tjblueoffset',['tjBlueOffset',['../group___turbo_j_p_e_g.html#ga84e2e35d3f08025f976ec1ec53693dea',1,'turbojpeg.h']]],
   ['tjbufsize',['tjBufSize',['../group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b',1,'turbojpeg.h']]],
-  ['tjbufsizeyuv',['tjBufSizeYUV',['../group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c',1,'turbojpeg.h']]],
+  ['tjbufsizeyuv2',['tjBufSizeYUV2',['../group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9',1,'turbojpeg.h']]],
   ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2',1,'turbojpeg.h']]],
+  ['tjcs',['TJCS',['../group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720',1,'turbojpeg.h']]],
+  ['tjcs_5fcmyk',['TJCS_CMYK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53',1,'turbojpeg.h']]],
+  ['tjcs_5fgray',['TJCS_GRAY',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a',1,'turbojpeg.h']]],
+  ['tjcs_5frgb',['TJCS_RGB',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555',1,'turbojpeg.h']]],
+  ['tjcs_5fycbcr',['TJCS_YCbCr',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75',1,'turbojpeg.h']]],
+  ['tjcs_5fycck',['TJCS_YCCK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e',1,'turbojpeg.h']]],
   ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9',1,'turbojpeg.h']]],
-  ['tjdecompressheader2',['tjDecompressHeader2',['../group___turbo_j_p_e_g.html#gac5675fceb7997b385516cdffdb34e6aa',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuv',['tjDecompressToYUV',['../group___turbo_j_p_e_g.html#gad7810af095624a4016e72957a50f77d8',1,'turbojpeg.h']]],
+  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07',1,'turbojpeg.h']]],
   ['tjdestroy',['tjDestroy',['../group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540',1,'turbojpeg.h']]],
-  ['tjencodeyuv2',['tjEncodeYUV2',['../group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35',1,'turbojpeg.h']]],
+  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360',1,'turbojpeg.h']]],
   ['tjflag_5faccuratedct',['TJFLAG_ACCURATEDCT',['../group___turbo_j_p_e_g.html#gacb233cfd722d66d1ccbf48a7de81f0e0',1,'turbojpeg.h']]],
   ['tjflag_5fbottomup',['TJFLAG_BOTTOMUP',['../group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec',1,'turbojpeg.h']]],
   ['tjflag_5ffastdct',['TJFLAG_FASTDCT',['../group___turbo_j_p_e_g.html#gaabce235db80d3f698b27f36cbd453da2',1,'turbojpeg.h']]],
@@ -39,6 +46,7 @@
   ['tjpf_5fbgr',['TJPF_BGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aab10624437fb8ef495a0b153e65749839',1,'turbojpeg.h']]],
   ['tjpf_5fbgra',['TJPF_BGRA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aac037ff1845cf9b74bb81a3659c2b9fb4',1,'turbojpeg.h']]],
   ['tjpf_5fbgrx',['TJPF_BGRX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa2a1fbf569ca79897eae886e3376ca4c8',1,'turbojpeg.h']]],
+  ['tjpf_5fcmyk',['TJPF_CMYK',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b',1,'turbojpeg.h']]],
   ['tjpf_5fgray',['TJPF_GRAY',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa5431b54b015337705f13118073711a1a',1,'turbojpeg.h']]],
   ['tjpf_5frgb',['TJPF_RGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7ce93230bff449518ce387c17e6ed37c',1,'turbojpeg.h']]],
   ['tjpf_5frgba',['TJPF_RGBA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa88d2e88fab67f6503cf972e14851cc12',1,'turbojpeg.h']]],
@@ -49,6 +57,7 @@
   ['tjredoffset',['tjRedOffset',['../group___turbo_j_p_e_g.html#gadd9b446742ac8a3923f7992c7988fea8',1,'turbojpeg.h']]],
   ['tjregion',['tjregion',['../structtjregion.html',1,'']]],
   ['tjsamp',['TJSAMP',['../group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074',1,'turbojpeg.h']]],
+  ['tjsamp_5f411',['TJSAMP_411',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2',1,'turbojpeg.h']]],
   ['tjsamp_5f420',['TJSAMP_420',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737',1,'turbojpeg.h']]],
   ['tjsamp_5f422',['TJSAMP_422',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a136130902cc578f11f32429b59368404',1,'turbojpeg.h']]],
   ['tjsamp_5f440',['TJSAMP_440',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974',1,'turbojpeg.h']]],
diff --git a/doc/html/search/enums_74.js b/doc/html/search/enums_74.js
index 20bd4db..276aa24 100644
--- a/doc/html/search/enums_74.js
+++ b/doc/html/search/enums_74.js
@@ -1,5 +1,6 @@
 var searchData=
 [
+  ['tjcs',['TJCS',['../group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720',1,'turbojpeg.h']]],
   ['tjpf',['TJPF',['../group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a',1,'turbojpeg.h']]],
   ['tjsamp',['TJSAMP',['../group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074',1,'turbojpeg.h']]],
   ['tjxop',['TJXOP',['../group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866',1,'turbojpeg.h']]]
diff --git a/doc/html/search/enumvalues_74.js b/doc/html/search/enumvalues_74.js
index 55664f1..7dc2f8d 100644
--- a/doc/html/search/enumvalues_74.js
+++ b/doc/html/search/enumvalues_74.js
@@ -1,16 +1,23 @@
 var searchData=
 [
+  ['tjcs_5fcmyk',['TJCS_CMYK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53',1,'turbojpeg.h']]],
+  ['tjcs_5fgray',['TJCS_GRAY',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a',1,'turbojpeg.h']]],
+  ['tjcs_5frgb',['TJCS_RGB',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555',1,'turbojpeg.h']]],
+  ['tjcs_5fycbcr',['TJCS_YCbCr',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75',1,'turbojpeg.h']]],
+  ['tjcs_5fycck',['TJCS_YCCK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e',1,'turbojpeg.h']]],
   ['tjpf_5fabgr',['TJPF_ABGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa1ba1a7f1631dbeaa49a0a85fc4a40081',1,'turbojpeg.h']]],
   ['tjpf_5fargb',['TJPF_ARGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c',1,'turbojpeg.h']]],
   ['tjpf_5fbgr',['TJPF_BGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aab10624437fb8ef495a0b153e65749839',1,'turbojpeg.h']]],
   ['tjpf_5fbgra',['TJPF_BGRA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aac037ff1845cf9b74bb81a3659c2b9fb4',1,'turbojpeg.h']]],
   ['tjpf_5fbgrx',['TJPF_BGRX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa2a1fbf569ca79897eae886e3376ca4c8',1,'turbojpeg.h']]],
+  ['tjpf_5fcmyk',['TJPF_CMYK',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b',1,'turbojpeg.h']]],
   ['tjpf_5fgray',['TJPF_GRAY',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa5431b54b015337705f13118073711a1a',1,'turbojpeg.h']]],
   ['tjpf_5frgb',['TJPF_RGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7ce93230bff449518ce387c17e6ed37c',1,'turbojpeg.h']]],
   ['tjpf_5frgba',['TJPF_RGBA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa88d2e88fab67f6503cf972e14851cc12',1,'turbojpeg.h']]],
   ['tjpf_5frgbx',['TJPF_RGBX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa83973bebb7e2dc6fa8bae89ff3f42e01',1,'turbojpeg.h']]],
   ['tjpf_5fxbgr',['TJPF_XBGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aaf6603b27147de47e212e75dac027b2af',1,'turbojpeg.h']]],
   ['tjpf_5fxrgb',['TJPF_XRGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aadae996905efcfa3b42a0bb3bea7f9d84',1,'turbojpeg.h']]],
+  ['tjsamp_5f411',['TJSAMP_411',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2',1,'turbojpeg.h']]],
   ['tjsamp_5f420',['TJSAMP_420',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737',1,'turbojpeg.h']]],
   ['tjsamp_5f422',['TJSAMP_422',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a136130902cc578f11f32429b59368404',1,'turbojpeg.h']]],
   ['tjsamp_5f440',['TJSAMP_440',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974',1,'turbojpeg.h']]],
diff --git a/doc/html/search/functions_74.js b/doc/html/search/functions_74.js
index c746a91..8f92c26 100644
--- a/doc/html/search/functions_74.js
+++ b/doc/html/search/functions_74.js
@@ -2,13 +2,13 @@
 [
   ['tjalloc',['tjAlloc',['../group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff',1,'turbojpeg.h']]],
   ['tjbufsize',['tjBufSize',['../group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b',1,'turbojpeg.h']]],
-  ['tjbufsizeyuv',['tjBufSizeYUV',['../group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c',1,'turbojpeg.h']]],
+  ['tjbufsizeyuv2',['tjBufSizeYUV2',['../group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9',1,'turbojpeg.h']]],
   ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2',1,'turbojpeg.h']]],
   ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9',1,'turbojpeg.h']]],
-  ['tjdecompressheader2',['tjDecompressHeader2',['../group___turbo_j_p_e_g.html#gac5675fceb7997b385516cdffdb34e6aa',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuv',['tjDecompressToYUV',['../group___turbo_j_p_e_g.html#gad7810af095624a4016e72957a50f77d8',1,'turbojpeg.h']]],
+  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07',1,'turbojpeg.h']]],
   ['tjdestroy',['tjDestroy',['../group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540',1,'turbojpeg.h']]],
-  ['tjencodeyuv2',['tjEncodeYUV2',['../group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35',1,'turbojpeg.h']]],
+  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360',1,'turbojpeg.h']]],
   ['tjfree',['tjFree',['../group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137',1,'turbojpeg.h']]],
   ['tjgeterrorstr',['tjGetErrorStr',['../group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf',1,'turbojpeg.h']]],
   ['tjgetscalingfactors',['tjGetScalingFactors',['../group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8',1,'turbojpeg.h']]],
diff --git a/doc/html/structtjregion.html b/doc/html/structtjregion.html
index 9ecd917..515686c 100644
--- a/doc/html/structtjregion.html
+++ b/doc/html/structtjregion.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjscalingfactor.html b/doc/html/structtjscalingfactor.html
index 33c7366..f34e150 100644
--- a/doc/html/structtjscalingfactor.html
+++ b/doc/html/structtjscalingfactor.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjtransform.html b/doc/html/structtjtransform.html
index 59198bb..9f2939a 100644
--- a/doc/html/structtjtransform.html
+++ b/doc/html/structtjtransform.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
@@ -108,7 +108,7 @@
 <tr class="memdesc:a688fe8f1a8ecc12a538d9e561cf338e3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Arbitrary data that can be accessed within the body of the callback function.  <a href="#a688fe8f1a8ecc12a538d9e561cf338e3">More...</a><br/></td></tr>
 <tr class="separator:a688fe8f1a8ecc12a538d9e561cf338e3"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a43ee1bcdd2a8d7249a756774f78793c1"><td class="memItemLeft" align="right" valign="top">int(*&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structtjtransform.html#a43ee1bcdd2a8d7249a756774f78793c1">customFilter</a> )(short *coeffs, <a class="el" href="structtjregion.html">tjregion</a> arrayRegion, <a class="el" href="structtjregion.html">tjregion</a> planeRegion, int componentIndex, int transformIndex, struct <a class="el" href="structtjtransform.html">tjtransform</a> *transform)</td></tr>
-<tr class="memdesc:a43ee1bcdd2a8d7249a756774f78793c1"><td class="mdescLeft">&#160;</td><td class="mdescRight">A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG file.  <a href="#a43ee1bcdd2a8d7249a756774f78793c1">More...</a><br/></td></tr>
+<tr class="memdesc:a43ee1bcdd2a8d7249a756774f78793c1"><td class="mdescLeft">&#160;</td><td class="mdescRight">A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG image.  <a href="#a43ee1bcdd2a8d7249a756774f78793c1">More...</a><br/></td></tr>
 <tr class="separator:a43ee1bcdd2a8d7249a756774f78793c1"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
@@ -124,7 +124,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG file. </p>
+<p>A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG image. </p>
 <p>This allows for custom filters or other transformations to be applied in the frequency domain.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
diff --git a/doxygen.config b/doxygen.config
index b881d82..9680175 100644
--- a/doxygen.config
+++ b/doxygen.config
@@ -1,5 +1,5 @@
 PROJECT_NAME = TurboJPEG
-PROJECT_NUMBER = 1.2.1
+PROJECT_NUMBER = 1.4
 OUTPUT_DIRECTORY = doc/
 USE_WINDOWS_ENCODING = NO
 OPTIMIZE_OUTPUT_FOR_C = YES
diff --git a/java/TJBench.java b/java/TJBench.java
index eaf5fa3..c4055f2 100644
--- a/java/TJBench.java
+++ b/java/TJBench.java
@@ -45,11 +45,15 @@
   };
 
   static final String[] subNameLong = {
-    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
   };
 
   static final String[] subName = {
-    "444", "422", "420", "GRAY", "440"
+    "444", "422", "420", "GRAY", "440", "411"
+  };
+
+  static final String[] csName = {
+    "RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
   };
 
   static TJScalingFactor sf;
@@ -62,6 +66,16 @@
   }
 
 
+  static String formatName(int subsamp, int cs) {
+    if (cs == TJ.CS_YCbCr)
+      return subNameLong[subsamp];
+    else if (cs == TJ.CS_YCCK)
+      return csName[cs] + " " + subNameLong[subsamp];
+    else
+      return csName[cs];
+  }
+
+
   static String sigFig(double val, int figs) {
     String format;
     int digitsAfterDecimal = figs - (int)Math.ceil(Math.log10(Math.abs(val)));
@@ -417,7 +431,7 @@
     byte[] srcBuf;
     int[] jpegSize;
     int totalJpegSize;
-    int w = 0, h = 0, subsamp = -1, _w, _h, _tilew, _tileh,
+    int w = 0, h = 0, subsamp = -1, cs = -1, _w, _h, _tilew, _tileh,
       _ntilesw, _ntilesh, _subsamp, x, y;
     int ntilesw = 1, ntilesh = 1;
     double start, elapsed;
@@ -439,17 +453,22 @@
     w = tjt.getWidth();
     h = tjt.getHeight();
     subsamp = tjt.getSubsamp();
+    cs = tjt.getColorspace();
 
     if (quiet == 1) {
       System.out.println("All performance values in Mpixels/sec\n");
-      System.out.format("Bitmap\tBitmap\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
+      System.out.format("Bitmap\tBitmap\tJPEG\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
                         (doTile ? "Tile " : "Image"),
                         (doTile ? "Tile " : "Image"));
-      System.out.println("Format\tOrder\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n");
+      System.out.println("Format\tOrder\tCS\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n");
     } else if (quiet == 0) {
-      System.out.format(">>>>>  JPEG %s --> %s (%s)  <<<<<",
-        subNameLong[subsamp], pixFormatStr[pf],
-        (flags & TJ.FLAG_BOTTOMUP) != 0 ? "Bottom-up" : "Top-down");
+      if (yuv == YUVDECODE)
+        System.out.format(">>>>>  JPEG %s --> YUV  <<<<<",
+          formatName(subsamp, cs));
+      else
+        System.out.format(">>>>>  JPEG %s --> %s (%s)  <<<<<",
+          formatName(subsamp, cs), pixFormatStr[pf],
+          (flags & TJ.FLAG_BOTTOMUP) != 0 ? "Bottom-up" : "Top-down");
     }
 
     for (int tilew = doTile ? 16 : w, tileh = doTile ? 16 : h; ;
@@ -470,9 +489,9 @@
                             sf.getScaled(_h));
         System.out.println("");
       } else if (quiet == 1) {
-        System.out.format("%s\t%s\t%s\t", pixFormatStr[pf],
+        System.out.format("%s\t%s\t%s\t%s\t", pixFormatStr[pf],
                           (flags & TJ.FLAG_BOTTOMUP) != 0 ? "BU" : "TD",
-                          subNameLong[subsamp]);
+                          csName[cs], subNameLong[subsamp]);
         System.out.format("%-4d  %-4d\t", tilew, tileh);
       }
 
@@ -601,6 +620,7 @@
     System.out.println("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the");
     System.out.println("     underlying codec");
     System.out.println("-440 = Test 4:4:0 chrominance subsampling instead of 4:2:2");
+    System.out.println("-411 = Test 4:1:1 chrominance subsampling instead of 4:2:0");
     System.out.println("-quiet = Output results in tabular rather than verbose format");
     System.out.println("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG");
     System.out.println("-yuvdecode = Decode JPEG image to planar YUV rather than RGB");
@@ -637,7 +657,7 @@
     byte[] srcBuf = null;  int w = 0, h = 0;
     int minQual = -1, maxQual = -1;
     int minArg = 1;  int retval = 0;
-    boolean do440 = false;
+    boolean do440 = false, do411 = false;
 
     try {
 
@@ -717,6 +737,8 @@
           }
           if (argv[i].equals("-440"))
             do440 = true;
+          if (argv[i].equals("-411"))
+            do411 = true;
           if (argv[i].equalsIgnoreCase("-rgb"))
             pf = TJ.PF_RGB;
           if (argv[i].equalsIgnoreCase("-rgbx"))
@@ -834,7 +856,7 @@
       System.out.println("");
       System.gc();
       for (int i = maxQual; i >= minQual; i--)
-        doTest(srcBuf, w, h, TJ.SAMP_420, i, argv[0]);
+        doTest(srcBuf, w, h, do411 ? TJ.SAMP_411 : TJ.SAMP_420, i, argv[0]);
       System.out.println("");
       System.gc();
       for (int i = maxQual; i >= minQual; i--)
diff --git a/java/TJUnitTest.java b/java/TJUnitTest.java
index ed67100..56226e2 100644
--- a/java/TJUnitTest.java
+++ b/java/TJUnitTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2012 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2013 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,24 +46,26 @@
     System.out.println("\nUSAGE: java " + classname + " [options]\n");
     System.out.println("Options:\n");
     System.out.println("-yuv = test YUV encoding/decoding support\n");
+    System.out.println("-noyuvpad = do not pad each line of each Y, U, and V plane to the nearest\n");
+    System.out.println("            4-byte boundary\n");
     System.out.println("-bi = test BufferedImage support\n");
     System.exit(1);
   }
 
   private static final String[] subNameLong = {
-    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
   };
   private static final String[] subName = {
-    "444", "422", "420", "GRAY", "440"
+    "444", "422", "420", "GRAY", "440", "411"
   };
 
   private static final String[] pixFormatStr = {
     "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
-    "RGBA", "BGRA", "ABGR", "ARGB"
+    "RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
   };
 
   private static final int[] alphaOffset = {
-    -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0
+    -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
   };
 
   private static final int[] _3byteFormats = {
@@ -73,7 +75,7 @@
     BufferedImage.TYPE_3BYTE_BGR
   };
   private static final int[] _4byteFormats = {
-    TJ.PF_RGBX, TJ.PF_BGRX, TJ.PF_XBGR, TJ.PF_XRGB
+    TJ.PF_RGBX, TJ.PF_BGRX, TJ.PF_XBGR, TJ.PF_XRGB, TJ.PF_CMYK
   };
   private static final int[] _4byteFormatsBI = {
     BufferedImage.TYPE_INT_BGR, BufferedImage.TYPE_INT_RGB,
@@ -93,6 +95,7 @@
   private static final int YUVENCODE = 1;
   private static final int YUVDECODE = 2;
   private static int yuv = 0;
+  private static int pad = 4;
   private static boolean bi = false;
 
   private static int exitStatus = 0;
@@ -162,8 +165,8 @@
     int ps = TJ.getPixelSize(pf);
     int index, row, col, halfway = 16;
 
-    Arrays.fill(buf, (byte)0);
     if (pf == TJ.PF_GRAY) {
+      Arrays.fill(buf, (byte)0);
       for (row = 0; row < h; row++) {
         for (col = 0; col < w; col++) {
           if ((flags & TJ.FLAG_BOTTOMUP) != 0)
@@ -178,6 +181,27 @@
       }
       return;
     }
+    if (pf == TJ.PF_CMYK) {
+      Arrays.fill(buf, (byte)255);
+      for (row = 0; row < h; row++) {
+        for (col = 0; col < w; col++) {
+          if ((flags & TJ.FLAG_BOTTOMUP) != 0)
+            index = (h - row - 1) * w + col;
+          else
+            index = row * w + col;
+          if (((row / 8) + (col / 8)) % 2 == 0) {
+            if (row >= halfway) buf[index * ps + 3] = 0;
+          } else {
+            buf[index * ps + 2] = 0;
+            if (row < halfway)
+              buf[index * ps + 1] = 0;
+          }
+        }
+      }
+      return;
+    }
+
+    Arrays.fill(buf, (byte)0);
     for (row = 0; row < h; row++) {
       for (col = 0; col < w; col++) {
         if ((flags & TJ.FLAG_BOTTOMUP) != 0)
@@ -296,6 +320,39 @@
     int blockSize = 8 * sf.getNum() / sf.getDenom();
 
     try {
+
+      if (pf == TJ.PF_CMYK) {
+        for (row = 0; row < h; row++) {
+          for (col = 0; col < w; col++) {
+            if ((flags & TJ.FLAG_BOTTOMUP) != 0)
+              index = (h - row - 1) * w + col;
+            else
+              index = row * w + col;
+            byte c = buf[index * ps];
+            byte m = buf[index * ps + 1];
+            byte y = buf[index * ps + 2]; 
+            byte k = buf[index * ps + 3];
+            checkVal255(row, col, c, "C");
+            if (((row / blockSize) + (col / blockSize)) % 2 == 0) {
+              checkVal255(row, col, m, "M");
+              checkVal255(row, col, y, "Y");
+              if (row < halfway)
+                checkVal255(row, col, k, "K");
+              else
+                checkVal0(row, col, k, "K");
+            } else {
+              checkVal0(row, col, y, "Y");
+              checkVal255(row, col, k, "K");
+              if (row < halfway)
+                checkVal0(row, col, m, "M");
+              else
+                checkVal255(row, col, m, "M");
+            }
+          }
+        }
+        return 1;
+      }
+
       for (row = 0; row < halfway; row++) {
         for (col = 0; col < w; col++) {
           if ((flags & TJ.FLAG_BOTTOMUP) != 0)
@@ -348,13 +405,25 @@
     if (retval == 0) {
       for (row = 0; row < h; row++) {
         for (col = 0; col < w; col++) {
-          int r = buf[pitch * row + col * ps + roffset];
-          int g = buf[pitch * row + col * ps + goffset];
-          int b = buf[pitch * row + col * ps + boffset];
-          if (r < 0) r += 256;
-          if (g < 0) g += 256;
-          if (b < 0) b += 256;
-          System.out.format("%3d/%3d/%3d ", r, g, b);
+          if (pf == TJ.PF_CMYK) {
+            int c = buf[pitch * row + col * ps];
+            int m = buf[pitch * row + col * ps + 1];
+            int y = buf[pitch * row + col * ps + 2];
+            int k = buf[pitch * row + col * ps + 3];
+            if (c < 0) c += 256;
+            if (m < 0) m += 256;
+            if (y < 0) y += 256;
+            if (k < 0) k += 256;
+            System.out.format("%3d/%3d/%3d/%3d ", c, m, y, k);
+          } else {
+            int r = buf[pitch * row + col * ps + roffset];
+            int g = buf[pitch * row + col * ps + goffset];
+            int b = buf[pitch * row + col * ps + boffset];
+            if (r < 0) r += 256;
+            if (g < 0) g += 256;
+            if (b < 0) b += 256;
+            System.out.format("%3d/%3d/%3d ", r, g, b);
+          }
         }
         System.out.print("\n");
       }
@@ -470,16 +539,18 @@
   }
 
   private static int checkBufYUV(byte[] buf, int size, int w, int h,
-                                 int subsamp) throws Exception {
+                                 int subsamp, TJScalingFactor sf)
+                                 throws Exception {
     int row, col;
     int hsf = TJ.getMCUWidth(subsamp) / 8, vsf = TJ.getMCUHeight(subsamp) / 8;
     int pw = PAD(w, hsf), ph = PAD(h, vsf);
     int cw = pw / hsf, ch = ph / vsf;
-    int ypitch = PAD(pw, 4), uvpitch = PAD(cw, 4);
+    int ypitch = PAD(pw, pad), uvpitch = PAD(cw, pad);
     int retval = 1;
     int correctsize = ypitch * ph +
                       (subsamp == TJ.SAMP_GRAY ? 0 : uvpitch * ch * 2);
-    int halfway = 16;
+    int halfway = 16 * sf.getNum() / sf.getDenom();
+    int blockSize = 8 * sf.getNum() / sf.getDenom();
 
     try {
       if (size != correctsize)
@@ -489,7 +560,7 @@
       for (row = 0; row < ph; row++) {
         for (col = 0; col < pw; col++) {
           byte y = buf[ypitch * row + col];
-          if (((row / 8) + (col / 8)) % 2 == 0) {
+          if (((row / blockSize) + (col / blockSize)) % 2 == 0) {
             if (row < halfway)
               checkVal255(row, col, y, "Y");
             else
@@ -503,12 +574,12 @@
         }
       }
       if (subsamp != TJ.SAMP_GRAY) {
-        halfway = 16 / vsf;
+        halfway = 16 / vsf * sf.getNum() / sf.getDenom();
         for (row = 0; row < ch; row++) {
           for (col = 0; col < cw; col++) {
             byte u = buf[ypitch * ph + (uvpitch * row + col)],
                  v = buf[ypitch * ph + uvpitch * ch + (uvpitch * row + col)];
-            if (((row * vsf / 8) + (col * hsf / 8)) % 2 == 0) {
+            if (((row * vsf / blockSize) + (col * hsf / blockSize)) % 2 == 0) {
               checkVal(row, col, u, "U", 128);
               checkVal(row, col, v, "V", 128);
             } else {
@@ -615,6 +686,7 @@
     t = getTime();
     tjc.setSubsamp(subsamp);
     tjc.setJPEGQuality(jpegQual);
+    tjc.setYUVPad(pad);
     if (bi) {
       if (yuv == YUVENCODE)
         tjc.encodeYUV(img, dstBuf, flags);
@@ -641,7 +713,8 @@
     writeJPEG(dstBuf, size, tempstr);
 
     if (yuv == YUVENCODE) {
-      if (checkBufYUV(dstBuf, size, w, h, subsamp) == 1)
+      if (checkBufYUV(dstBuf, size, w, h, subsamp,
+                      new TJScalingFactor(1, 1)) == 1)
         System.out.print("Passed.");
       else {
         System.out.print("FAILED!");
@@ -677,7 +750,7 @@
 
     System.out.print("JPEG -> ");
     if (yuv == YUVDECODE)
-      System.out.print("YUV " + subNameLong[subsamp] + " ... ");
+      System.out.print("YUV " + subNameLong[subsamp] + " ");
     else {
       System.out.print(pfStr + " ");
       if (bi)
@@ -686,11 +759,11 @@
         System.out.print("Bottom-Up ");
       else
         System.out.print("Top-Down  ");
-      if (!sf.isOne())
-        System.out.print(sf.getNum() + "/" + sf.getDenom() + " ... ");
-      else
-        System.out.print("... ");
     }
+    if (!sf.isOne())
+      System.out.print(sf.getNum() + "/" + sf.getDenom() + " ... ");
+    else
+      System.out.print("... ");
 
     t = getTime();
     tjd.setJPEGImage(jpegBuf, jpegSize);
@@ -706,7 +779,7 @@
       throw new Exception("Scaled size mismatch");
 
     if (yuv == YUVDECODE)
-      dstBuf = tjd.decompressToYUV(flags);
+      dstBuf = tjd.decompressToYUV(scaledWidth, pad, scaledHeight, flags);
     else {
       if (bi)
         img = tjd.decompress(scaledWidth, scaledHeight, imgType, flags);
@@ -725,7 +798,8 @@
     }
 
     if (yuv == YUVDECODE) {
-      if (checkBufYUV(dstBuf, dstBuf.length, w, h, subsamp) == 1)
+      if (checkBufYUV(dstBuf, dstBuf.length, scaledWidth, scaledHeight,
+                      subsamp, sf) == 1)
         System.out.print("Passed.");
       else {
         System.out.print("FAILED!");  exitStatus = -1;
@@ -749,14 +823,18 @@
                                  String baseName, int subsamp,
                                  int flags) throws Exception {
     int i;
-    if ((subsamp == TJ.SAMP_444 || subsamp == TJ.SAMP_GRAY) && yuv == 0) {
-      TJScalingFactor[] sf = TJ.getScalingFactors();
-      for (i = 0; i < sf.length; i++)
+    TJScalingFactor[] sf = TJ.getScalingFactors();
+    for (i = 0; i < sf.length; i++) {
+      int num = sf[i].getNum();
+      int denom = sf[i].getDenom();
+      if (subsamp == TJ.SAMP_444 || subsamp == TJ.SAMP_GRAY ||
+          (subsamp == TJ.SAMP_411 && num == 1 &&
+           (denom == 2 || denom == 1)) ||
+          (subsamp != TJ.SAMP_411 && num == 1 &&
+           (denom == 4 || denom == 2 || denom == 1)))
         decompTest(tjd, jpegBuf, jpegSize, w, h, pf, baseName, subsamp,
                    flags, sf[i]);
-    } else
-      decompTest(tjd, jpegBuf, jpegSize, w, h, pf, baseName, subsamp,
-                 flags, new TJScalingFactor(1, 1));
+    }
   }
 
   private static void doTest(int w, int h, int[] formats, int subsamp,
@@ -767,7 +845,7 @@
     byte[] dstBuf;
 
     if (yuv == YUVENCODE)
-      dstBuf = new byte[TJ.bufSizeYUV(w, h, subsamp)];
+      dstBuf = new byte[TJ.bufSizeYUV(w, pad, h, subsamp)];
     else
       dstBuf = new byte[TJ.bufSize(w, h, subsamp)];
 
@@ -776,10 +854,11 @@
       tjd = new TJDecompressor();
 
       for (int pf : formats) {
+        if (pf < 0) continue;
         for (int i = 0; i < 2; i++) {
           int flags = 0;
           if (subsamp == TJ.SAMP_422 || subsamp == TJ.SAMP_420 ||
-              subsamp == TJ.SAMP_440)
+              subsamp == TJ.SAMP_440 || subsamp == TJ.SAMP_411)
             flags |= TJ.FLAG_FASTUPSAMPLE;
           if (i == 1) {
             if (yuv == YUVDECODE) {
@@ -858,6 +937,8 @@
       for (int i = 0; i < argv.length; i++) {
         if (argv[i].equalsIgnoreCase("-yuv"))
           doyuv = true;
+        if (argv[i].equalsIgnoreCase("-noyuvpad"))
+          pad = 1;
         if (argv[i].substring(0, 1).equalsIgnoreCase("-h") ||
             argv[i].equalsIgnoreCase("-?"))
           usage();
@@ -866,7 +947,10 @@
           testName = "javabitest";
         }
       }
-      if (doyuv) yuv = YUVENCODE;
+      if (doyuv) {
+        yuv = YUVENCODE;
+        _4byteFormats[4] = -1;
+      }
       doTest(35, 39, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_444,
              testName);
       doTest(39, 41, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_444,
@@ -883,10 +967,15 @@
              testName);
       doTest(39, 41, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_440,
              testName);
-      doTest(35, 39, bi ? onlyGrayBI : onlyGray, TJ.SAMP_GRAY, testName);
-      doTest(39, 41, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_GRAY,
+      doTest(41, 35, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_411,
              testName);
-      doTest(41, 35, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_GRAY,
+      doTest(35, 39, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_411,
+             testName);
+      doTest(39, 41, bi ? onlyGrayBI : onlyGray, TJ.SAMP_GRAY, testName);
+      doTest(41, 35, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_GRAY,
+             testName);
+      _4byteFormats[4] = -1;
+      doTest(35, 39, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_GRAY,
              testName);
       if (!doyuv && !bi)
         bufSizeTest();
@@ -900,10 +989,12 @@
         doTest(41, 35, onlyRGB, TJ.SAMP_420, "javatest_yuv1");
         doTest(48, 48, onlyRGB, TJ.SAMP_440, "javatest_yuv0");
         doTest(35, 39, onlyRGB, TJ.SAMP_440, "javatest_yuv1");
+        doTest(48, 48, onlyRGB, TJ.SAMP_411, "javatest_yuv0");
+        doTest(39, 41, onlyRGB, TJ.SAMP_411, "javatest_yuv1");
         doTest(48, 48, onlyRGB, TJ.SAMP_GRAY, "javatest_yuv0");
-        doTest(35, 39, onlyRGB, TJ.SAMP_GRAY, "javatest_yuv1");
+        doTest(41, 35, onlyRGB, TJ.SAMP_GRAY, "javatest_yuv1");
         doTest(48, 48, onlyGray, TJ.SAMP_GRAY, "javatest_yuv0");
-        doTest(39, 41, onlyGray, TJ.SAMP_GRAY, "javatest_yuv1");
+        doTest(35, 39, onlyGray, TJ.SAMP_GRAY, "javatest_yuv1");
       }
     } catch(Exception e) {
       e.printStackTrace();
diff --git a/java/doc/constant-values.html b/java/doc/constant-values.html
index e4adb67..01f950f 100644
--- a/java/doc/constant-values.html
+++ b/java/doc/constant-values.html
@@ -99,6 +99,36 @@
 <TH ALIGN="left" COLSPAN="3">org.libjpegturbo.turbojpeg.<A HREF="org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_CMYK"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_CMYK">CS_CMYK</A></CODE></TD>
+<TD ALIGN="right"><CODE>3</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_GRAY"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_GRAY">CS_GRAY</A></CODE></TD>
+<TD ALIGN="right"><CODE>2</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_RGB"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_RGB">CS_RGB</A></CODE></TD>
+<TD ALIGN="right"><CODE>0</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_YCbCr"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr">CS_YCbCr</A></CODE></TD>
+<TD ALIGN="right"><CODE>1</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_YCCK"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK">CS_YCCK</A></CODE></TD>
+<TD ALIGN="right"><CODE>4</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.FLAG_ACCURATEDCT"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#FLAG_ACCURATEDCT">FLAG_ACCURATEDCT</A></CODE></TD>
@@ -147,16 +177,22 @@
 <TD ALIGN="right"><CODE>128</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.NUMCS"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#NUMCS">NUMCS</A></CODE></TD>
+<TD ALIGN="right"><CODE>5</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.NUMPF"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#NUMPF">NUMPF</A></CODE></TD>
-<TD ALIGN="right"><CODE>11</CODE></TD>
+<TD ALIGN="right"><CODE>12</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.NUMSAMP"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#NUMSAMP">NUMSAMP</A></CODE></TD>
-<TD ALIGN="right"><CODE>5</CODE></TD>
+<TD ALIGN="right"><CODE>6</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.PF_ABGR"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
@@ -189,6 +225,12 @@
 <TD ALIGN="right"><CODE>3</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.PF_CMYK"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK">PF_CMYK</A></CODE></TD>
+<TD ALIGN="right"><CODE>11</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.PF_GRAY"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY">PF_GRAY</A></CODE></TD>
@@ -225,6 +267,12 @@
 <TD ALIGN="right"><CODE>5</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.SAMP_411"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#SAMP_411">SAMP_411</A></CODE></TD>
+<TD ALIGN="right"><CODE>5</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.SAMP_420"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#SAMP_420">SAMP_420</A></CODE></TD>
diff --git a/java/doc/deprecated-list.html b/java/doc/deprecated-list.html
index 37ca515..2430387 100644
--- a/java/doc/deprecated-list.html
+++ b/java/doc/deprecated-list.html
@@ -91,12 +91,28 @@
 <B>Deprecated Methods</B></FONT></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)">org.libjpegturbo.turbojpeg.TJ.bufSizeYUV(int, int, int)</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use <A HREF="org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> instead.</I>&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <TD><A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int)">org.libjpegturbo.turbojpeg.TJDecompressor.decompress(byte[], int, int, int, int, int)</A>
 <BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use
  <A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int, int, int)"><CODE>TJDecompressor.decompress(byte[], int, int, int, int, int, int, int)</CODE></A> instead.</I>&nbsp;</TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)">org.libjpegturbo.turbojpeg.TJDecompressor.decompressToYUV(byte[], int)</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use <A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I>&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">org.libjpegturbo.turbojpeg.TJDecompressor.decompressToYUV(int)</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use <A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(int, int, int, int)</CODE></A> instead.</I>&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <TD><A HREF="org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[], int, int, int, int)">org.libjpegturbo.turbojpeg.TJCompressor.setSourceImage(byte[], int, int, int, int)</A>
 <BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use
diff --git a/java/doc/index-all.html b/java/doc/index-all.html
index a534d43..4555379 100644
--- a/java/doc/index-all.html
+++ b/java/doc/index-all.html
@@ -82,10 +82,13 @@
 Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>Returns the maximum size of the buffer (in bytes) required to hold a JPEG
  image with the given width, height, and level of chrominance subsampling.
-<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><B>bufSizeYUV(int, int, int)</B></A> - 
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><B>bufSizeYUV(int, int, int, int)</B></A> - 
 Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>Returns the size of the buffer (in bytes) required to hold a YUV planar
  image with the given width, height, and level of chrominance subsampling.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><B>bufSizeYUV(int, int, int)</B></A> - 
+Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> instead.</I>
 </DL>
 <HR>
 <A NAME="_C_"><!-- --></A><H2>
@@ -116,11 +119,26 @@
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
 <DD>Compress the uncompressed source image stored in <code>srcImage</code>
  and return a buffer containing a JPEG image.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_CMYK"><B>CS_CMYK</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>CMYK colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_GRAY"><B>CS_GRAY</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>Grayscale colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_RGB"><B>CS_RGB</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>RGB colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr"><B>CS_YCbCr</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>YCbCr colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK"><B>CS_YCCK</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>YCCK colorspace.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJCustomFilter.html#customFilter(java.nio.ShortBuffer, java.awt.Rectangle, java.awt.Rectangle, int, int, org.libjpegturbo.turbojpeg.TJTransform)"><B>customFilter(ShortBuffer, Rectangle, Rectangle, int, int, TJTransform)</B></A> - 
 Method in interface org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg">TJCustomFilter</A>
 <DD>A callback function that can be used to modify the DCT coefficients after
  they are losslessly transformed but before they are transcoded to a new
- JPEG file.
+ JPEG image.
 </DL>
 <HR>
 <A NAME="_D_"><!-- --></A><H2>
@@ -152,14 +170,21 @@
 <DD>Decompress the JPEG source image associated with this decompressor
  instance and return a <code>BufferedImage</code> instance containing the
  decompressed image.
-<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)"><B>decompressToYUV(byte[], int)</B></A> - 
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><B>decompressToYUV(byte[], int, int, int, int)</B></A> - 
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>Decompress the JPEG source image associated with this decompressor
  instance and output a YUV planar image to the given destination buffer.
-<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)"><B>decompressToYUV(int)</B></A> - 
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)"><B>decompressToYUV(byte[], int)</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I>
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><B>decompressToYUV(int, int, int, int)</B></A> - 
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>Decompress the JPEG source image associated with this decompressor
  instance and return a buffer containing a YUV planar image.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)"><B>decompressToYUV(int)</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(int, int, int, int)</CODE></A> instead.</I>
 </DL>
 <HR>
 <A NAME="_E_"><!-- --></A><H2>
@@ -237,6 +262,10 @@
 Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>For the given pixel format, returns the number of bytes that the blue
  component is offset from the start of the pixel.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#getColorspace()"><B>getColorspace()</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD>Returns the colorspace used in the JPEG image associated with this
+ decompressor instance.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html#getCompressedSize()"><B>getCompressedSize()</B></A> - 
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
 <DD>Returns the size of the image (in bytes) generated by the most recent
@@ -334,6 +363,9 @@
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBufSize"><B>jpegBufSize</B></A> - 
 Variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>&nbsp;
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegColorspace"><B>jpegColorspace</B></A> - 
+Variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD>&nbsp;
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight"><B>jpegHeight</B></A> - 
 Variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>&nbsp;
@@ -348,6 +380,9 @@
 <A NAME="_N_"><!-- --></A><H2>
 <B>N</B></H2>
 <DL>
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#NUMCS"><B>NUMCS</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>The number of JPEG colorspaces
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJTransform.html#NUMOP"><B>NUMOP</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</A>
 <DD>The number of lossless transform operations
@@ -432,6 +467,9 @@
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#PF_BGRX"><B>PF_BGRX</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>BGRX pixel format.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK"><B>PF_CMYK</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>CMYK pixel format.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY"><B>PF_GRAY</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>Grayscale pixel format.
@@ -455,6 +493,9 @@
 <A NAME="_S_"><!-- --></A><H2>
 <B>S</B></H2>
 <DL>
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#SAMP_411"><B>SAMP_411</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>4:1:1 chrominance subsampling.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#SAMP_420"><B>SAMP_420</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>4:2:0 chrominance subsampling.
@@ -488,6 +529,9 @@
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
 <DD>Set the level of chrominance subsampling for subsequent compress/encode
  operations.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html#setYUVPad(int)"><B>setYUVPad(int)</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
+<DD>Set the plane padding for subsequent YUV encode operations.
 </DL>
 <HR>
 <A NAME="_T_"><!-- --></A><H2>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJ.html b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
index f905406..3a94cba 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJ.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
@@ -115,6 +115,46 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_CMYK">CS_CMYK</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;CMYK colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_GRAY">CS_GRAY</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Grayscale colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_RGB">CS_RGB</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;RGB colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr">CS_YCbCr</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;YCbCr colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK">CS_YCCK</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;YCCK colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_ACCURATEDCT">FLAG_ACCURATEDCT</A></B></CODE>
 
 <BR>
@@ -187,6 +227,14 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#NUMCS">NUMCS</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;The number of JPEG colorspaces</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#NUMPF">NUMPF</A></B></CODE>
 
 <BR>
@@ -243,6 +291,14 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK">PF_CMYK</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;CMYK pixel format.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY">PF_GRAY</A></B></CODE>
 
 <BR>
@@ -291,6 +347,14 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_411">SAMP_411</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4:1:1 chrominance subsampling.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_420">SAMP_420</A></B></CODE>
 
 <BR>
@@ -373,6 +437,17 @@
            int&nbsp;subsamp)</CODE>
 
 <BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>bufSizeYUV(int, int, int, int)</CODE></A> instead.</I></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)">bufSizeYUV</A></B>(int&nbsp;width,
+           int&nbsp;pad,
+           int&nbsp;height,
+           int&nbsp;subsamp)</CODE>
+
+<BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Returns the size of the buffer (in bytes) required to hold a YUV planar
  image with the given width, height, and level of chrominance subsampling.</TD>
 </TR>
@@ -539,6 +614,25 @@
 </DL>
 <HR>
 
+<A NAME="SAMP_411"><!-- --></A><H3>
+SAMP_411</H3>
+<PRE>
+public static final int <B>SAMP_411</B></PRE>
+<DL>
+<DD>4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+ chrominance component for every 4x1 block of pixels in the source image.
+ JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+ same size as those compressed with 4:2:0 subsampling, and in the
+ aggregate, both subsampling methods produce approximately the same
+ perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+ horizontal features.  Note that 4:1:1 subsampling is not fully accelerated
+ in libjpeg-turbo.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.SAMP_411">Constant Field Values</A></DL>
+</DL>
+<HR>
+
 <A NAME="NUMPF"><!-- --></A><H3>
 NUMPF</H3>
 <PRE>
@@ -708,6 +802,131 @@
 </DL>
 <HR>
 
+<A NAME="PF_CMYK"><!-- --></A><H3>
+PF_CMYK</H3>
+<PRE>
+public static final int <B>PF_CMYK</B></PRE>
+<DL>
+<DD>CMYK pixel format.  Unlike RGB, which is an additive color model used
+ primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+ color model used primarily for printing.  In the CMYK color model, the
+ value of each color component typically corresponds to an amount of cyan,
+ magenta, yellow, or black ink that is applied to a white background.  In
+ order to convert between CMYK and RGB, it is necessary to use a color
+ management system (CMS.)  A CMS will attempt to map colors within the
+ printer's gamut to perceptually similar colors in the display's gamut and
+ vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+ be defined with a simple formula.  Thus, such a conversion is out of scope
+ for a codec library.  However, the TurboJPEG API allows for compressing
+ CMYK pixels into a YCCK JPEG image (see <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK"><CODE>CS_YCCK</CODE></A>) and
+ decompressing YCCK JPEG images into CMYK pixels.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.PF_CMYK">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="NUMCS"><!-- --></A><H3>
+NUMCS</H3>
+<PRE>
+public static final int <B>NUMCS</B></PRE>
+<DL>
+<DD>The number of JPEG colorspaces
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.NUMCS">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_RGB"><!-- --></A><H3>
+CS_RGB</H3>
+<PRE>
+public static final int <B>CS_RGB</B></PRE>
+<DL>
+<DD>RGB colorspace.  When compressing the JPEG image, the R, G, and B
+ components in the source image are reordered into image planes, but no
+ colorspace conversion or subsampling is performed.  RGB JPEG images can be
+ decompressed to any of the extended RGB pixel formats or grayscale, but
+ they cannot be decompressed to YUV images.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_RGB">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_YCbCr"><!-- --></A><H3>
+CS_YCbCr</H3>
+<PRE>
+public static final int <B>CS_YCbCr</B></PRE>
+<DL>
+<DD>YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+ mathematical transformation of RGB designed solely for storage and
+ transmission.  YCbCr images must be converted to RGB before they can
+ actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+ component represents the black & white portion of the original image, and
+ the Cb and Cr (chrominance) components represent the color portion of the
+ original image.  Originally, the analog equivalent of this transformation
+ allowed the same signal to drive both black & white and color televisions,
+ but JPEG images use YCbCr primarily because it allows the color data to be
+ optionally subsampled for the purposes of reducing bandwidth or disk
+ space.  YCbCr is the most common JPEG colorspace, and YCbCr JPEG images
+ can be compressed from and decompressed to any of the extended RGB pixel
+ formats or grayscale, or they can be decompressed to YUV planar images.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_YCbCr">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_GRAY"><!-- --></A><H3>
+CS_GRAY</H3>
+<PRE>
+public static final int <B>CS_GRAY</B></PRE>
+<DL>
+<DD>Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+ component), and any color data from the source image is discarded.
+ Grayscale JPEG images can be compressed from and decompressed to any of
+ the extended RGB pixel formats or grayscale, or they can be decompressed
+ to YUV planar images.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_GRAY">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_CMYK"><!-- --></A><H3>
+CS_CMYK</H3>
+<PRE>
+public static final int <B>CS_CMYK</B></PRE>
+<DL>
+<DD>CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+ components in the source image are reordered into image planes, but no
+ colorspace conversion or subsampling is performed.  CMYK JPEG images can
+ only be decompressed to CMYK pixels.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_CMYK">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_YCCK"><!-- --></A><H3>
+CS_YCCK</H3>
+<PRE>
+public static final int <B>CS_YCCK</B></PRE>
+<DL>
+<DD>YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+ rather a mathematical transformation of CMYK designed solely for storage
+ and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+ reversibly transformed into YCCK, and as with YCbCr, the chrominance
+ components in the YCCK pixels can be subsampled without incurring major
+ perceptual loss.  YCCK JPEG images can only be compressed from and
+ decompressed to CMYK pixels.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_YCCK">Constant Field Values</A></DL>
+</DL>
+<HR>
+
 <A NAME="FLAG_BOTTOMUP"><!-- --></A><H3>
 FLAG_BOTTOMUP</H3>
 <PRE>
@@ -991,10 +1210,11 @@
 </DL>
 <HR>
 
-<A NAME="bufSizeYUV(int, int, int)"><!-- --></A><H3>
+<A NAME="bufSizeYUV(int, int, int, int)"><!-- --></A><H3>
 bufSizeYUV</H3>
 <PRE>
 public static int <B>bufSizeYUV</B>(int&nbsp;width,
+                             int&nbsp;pad,
                              int&nbsp;height,
                              int&nbsp;subsamp)
                       throws java.lang.Exception</PRE>
@@ -1003,7 +1223,9 @@
  image with the given width, height, and level of chrominance subsampling.
 <P>
 <DD><DL>
-<DT><B>Parameters:</B><DD><CODE>width</CODE> - the width (in pixels) of the YUV image<DD><CODE>height</CODE> - the height (in pixels) of the YUV image<DD><CODE>subsamp</CODE> - the level of chrominance subsampling used in the YUV
+<DT><B>Parameters:</B><DD><CODE>width</CODE> - the width (in pixels) of the YUV image<DD><CODE>pad</CODE> - the width of each line in each plane of the image is padded to
+        the nearest multiple of this number of bytes (must be a power of
+        2.)<DD><CODE>height</CODE> - the height (in pixels) of the YUV image<DD><CODE>subsamp</CODE> - the level of chrominance subsampling used in the YUV
  image (one of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.SAMP_*</CODE></A>)
 <DT><B>Returns:</B><DD>the size of the buffer (in bytes) required to hold a YUV planar
  image with the given width, height, and level of chrominance subsampling
@@ -1013,6 +1235,25 @@
 </DL>
 <HR>
 
+<A NAME="bufSizeYUV(int, int, int)"><!-- --></A><H3>
+bufSizeYUV</H3>
+<PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public static int <B>bufSizeYUV</B>(int&nbsp;width,
+                                        int&nbsp;height,
+                                        int&nbsp;subsamp)
+                      throws java.lang.Exception</PRE>
+<DL>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>bufSizeYUV(int, int, int, int)</CODE></A> instead.</I>
+<P>
+<DD><DL>
+
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
 <A NAME="getScalingFactors()"><!-- --></A><H3>
 getScalingFactors</H3>
 <PRE>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
index bc3d67e..a41ea06 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
@@ -304,6 +304,14 @@
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Set the level of chrominance subsampling for subsequent compress/encode
  operations.</TD>
 </TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;void</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setYUVPad(int)">setYUVPad</A></B>(int&nbsp;pad)</CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Set the plane padding for subsequent YUV encode operations.</TD>
+</TR>
 </TABLE>
 &nbsp;<A NAME="methods_inherited_from_class_java.lang.Object"><!-- --></A>
 <TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
@@ -412,8 +420,8 @@
 <DD>Associate an uncompressed source image with this compressor instance.
 <P>
 <DD><DL>
-<DT><B>Parameters:</B><DD><CODE>srcImage</CODE> - image buffer containing RGB or grayscale pixels to be
- compressed<DD><CODE>x</CODE> - x offset (in pixels) of the region from which the JPEG image
+<DT><B>Parameters:</B><DD><CODE>srcImage</CODE> - image buffer containing RGB, grayscale, or CMYK pixels to
+ be compressed<DD><CODE>x</CODE> - x offset (in pixels) of the region from which the JPEG image
  should be compressed, relative to the start of <code>srcImage</code>.<DD><CODE>y</CODE> - y offset (in pixels) of the region from which the JPEG image
  should be compressed, relative to the start of <code>srcImage</code>.<DD><CODE>width</CODE> - width (in pixels) of the region in the source image from
  which the JPEG image should be compressed.<DD><CODE>pitch</CODE> - bytes per line of the source image.  Normally, this should be
@@ -435,11 +443,12 @@
 <A NAME="setSourceImage(byte[], int, int, int, int)"><!-- --></A><H3>
 setSourceImage</H3>
 <PRE>
-public void <B>setSourceImage</B>(byte[]&nbsp;srcImage,
-                           int&nbsp;width,
-                           int&nbsp;pitch,
-                           int&nbsp;height,
-                           int&nbsp;pixelFormat)
+<FONT SIZE="-1">@Deprecated
+</FONT>public void <B>setSourceImage</B>(byte[]&nbsp;srcImage,
+                                      int&nbsp;width,
+                                      int&nbsp;pitch,
+                                      int&nbsp;height,
+                                      int&nbsp;pixelFormat)
                     throws java.lang.Exception</PRE>
 <DL>
 <DD><B>Deprecated.</B>&nbsp;<I>Use
@@ -460,7 +469,13 @@
                 throws java.lang.Exception</PRE>
 <DL>
 <DD>Set the level of chrominance subsampling for subsequent compress/encode
- operations.
+ operations.  When pixels are converted from RGB to YCbCr (see
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr"><CODE>TJ.CS_YCbCr</CODE></A>) or from CMYK to YCCK (see <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK"><CODE>TJ.CS_YCCK</CODE></A>) as part
+ of the JPEG compression process, some of the Cb and Cr (chrominance)
+ components can be discarded or averaged together to produce a smaller
+ image with little perceptible loss of image clarity (the human eye is more
+ sensitive to small changes in brightness than to small changes in color.)
+ This is called "chrominance subsampling".
 <P>
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>newSubsamp</CODE> - the new level of chrominance subsampling (one of
@@ -570,6 +585,25 @@
 </DL>
 <HR>
 
+<A NAME="setYUVPad(int)"><!-- --></A><H3>
+setYUVPad</H3>
+<PRE>
+public void <B>setYUVPad</B>(int&nbsp;pad)
+               throws java.lang.Exception</PRE>
+<DL>
+<DD>Set the plane padding for subsequent YUV encode operations.
+<P>
+<DD><DL>
+<DT><B>Parameters:</B><DD><CODE>pad</CODE> - the width of each line in each plane of the YUV image will be
+        padded to the nearest multiple of this number of bytes (must be a
+        power of 2.)  The default padding is 4 bytes, which generates
+        images suitable for direct video display.
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
 <A NAME="encodeYUV(byte[], int)"><!-- --></A><H3>
 encodeYUV</H3>
 <PRE>
@@ -579,21 +613,24 @@
 <DL>
 <DD>Encode the uncompressed source image associated with this compressor
  instance and output a YUV planar image to the given destination buffer.
- This method uses the accelerated color conversion routines in
- TurboJPEG's underlying codec to produce a planar YUV image that is
- suitable for direct video display.  Specifically, if the chrominance
- components are subsampled along the horizontal dimension, then the width
- of the luminance plane is padded to the nearest multiple of 2 in the
- output image (same goes for the height of the luminance plane, if the
+ This method uses the accelerated color conversion routines in TurboJPEG's
+ underlying codec but does not execute any of the other steps in the JPEG
+ compression process.  The Y, U (Cb), and V (Cr) image planes are stored
+ sequentially into the destination buffer, and the size of each plane is
+ determined by the width and height of the source image, as well as the
+ specified padding and level of chrominance subsampling.  If the
+ chrominance components are subsampled along the horizontal dimension, then
+ the width of the luminance plane is padded to the nearest multiple of 2 in
+ the output image (same goes for the height of the luminance plane, if the
  chrominance components are subsampled along the vertical dimension.)
- Also, each line of each plane in the output image is padded to 4 bytes.
- Although this will work with any subsampling option, it is really only
- useful in combination with <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_420"><CODE>TJ.SAMP_420</CODE></A>, which produces an image
- compatible with the I420 (AKA "YUV420P") format.
+ <p>
+ NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+ convention of the digital video community, the TurboJPEG API uses "YUV" to
+ refer to an image format consisting of Y, Cb, and Cr image planes.
 <P>
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>dstBuf</CODE> - buffer that will receive the YUV planar image.  Use
- <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int)</CODE></A> to determine the appropriate size for this buffer
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> to determine the appropriate size for this buffer
  based on the image width, height, and level of chrominance subsampling.<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
@@ -635,7 +672,7 @@
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>srcImage</CODE> - a <code>BufferedImage</code> instance containing RGB or
  grayscale pixels to be encoded<DD><CODE>dstBuf</CODE> - buffer that will receive the YUV planar image.  Use
- <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int)</CODE></A> to determine the appropriate size for this buffer
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> to determine the appropriate size for this buffer
  based on the image width, height, and level of chrominance subsampling.<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
index 4cb2615..d4f6bff 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
@@ -122,7 +122,7 @@
 <BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;A callback function that can be used to modify the DCT coefficients after
  they are losslessly transformed but before they are transcoded to a new
- JPEG file.</TD>
+ JPEG image.</TD>
 </TR>
 </TABLE>
 &nbsp;
@@ -151,7 +151,7 @@
 <DL>
 <DD>A callback function that can be used to modify the DCT coefficients after
  they are losslessly transformed but before they are transcoded to a new
- JPEG file.  This allows for custom filters or other transformations to be
+ JPEG image.  This allows for custom filters or other transformations to be
  applied in the frequency domain.
 <P>
 <DD><DL>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
index 832e474..2dc3cc6 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
@@ -142,6 +142,14 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>protected &nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegColorspace">jpegColorspace</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>protected &nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight">jpegHeight</A></B></CODE>
 
 <BR>
@@ -303,6 +311,19 @@
                 int&nbsp;flags)</CODE>
 
 <BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;void</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)">decompressToYUV</A></B>(byte[]&nbsp;dstBuf,
+                int&nbsp;desiredWidth,
+                int&nbsp;pad,
+                int&nbsp;desiredHeight,
+                int&nbsp;flags)</CODE>
+
+<BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Decompress the JPEG source image associated with this decompressor
  instance and output a YUV planar image to the given destination buffer.</TD>
 </TR>
@@ -312,6 +333,17 @@
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">decompressToYUV</A></B>(int&nbsp;flags)</CODE>
 
 <BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>decompressToYUV(int, int, int, int)</CODE></A> instead.</I></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;byte[]</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)">decompressToYUV</A></B>(int&nbsp;desiredWidth,
+                int&nbsp;pad,
+                int&nbsp;desiredHeight,
+                int&nbsp;flags)</CODE>
+
+<BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Decompress the JPEG source image associated with this decompressor
  instance and return a buffer containing a YUV planar image.</TD>
 </TR>
@@ -326,6 +358,15 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getColorspace()">getColorspace</A></B>()</CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Returns the colorspace used in the JPEG image associated with this
+ decompressor instance.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getHeight()">getHeight</A></B>()</CODE>
 
 <BR>
@@ -480,6 +521,16 @@
 <DL>
 </DL>
 </DL>
+<HR>
+
+<A NAME="jpegColorspace"><!-- --></A><H3>
+jpegColorspace</H3>
+<PRE>
+protected int <B>jpegColorspace</B></PRE>
+<DL>
+<DL>
+</DL>
+</DL>
 
 <!-- ========= CONSTRUCTOR DETAIL ======== -->
 
@@ -614,7 +665,7 @@
                throws java.lang.Exception</PRE>
 <DL>
 <DD>Returns the level of chrominance subsampling used in the JPEG image
- associated with this decompressor instance.
+ associated with this decompressor instance.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.SAMP_*</CODE></A>.
 <P>
 <DD><DL>
 
@@ -626,6 +677,25 @@
 </DL>
 <HR>
 
+<A NAME="getColorspace()"><!-- --></A><H3>
+getColorspace</H3>
+<PRE>
+public int <B>getColorspace</B>()
+                  throws java.lang.Exception</PRE>
+<DL>
+<DD>Returns the colorspace used in the JPEG image associated with this
+ decompressor instance.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.CS_*</CODE></A>.
+<P>
+<DD><DL>
+
+<DT><B>Returns:</B><DD>the colorspace used in the JPEG image associated with this
+ decompressor instance
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
 <A NAME="getJPEGBuf()"><!-- --></A><H3>
 getJPEGBuf</H3>
 <PRE>
@@ -777,12 +847,13 @@
 <A NAME="decompress(byte[], int, int, int, int, int)"><!-- --></A><H3>
 decompress</H3>
 <PRE>
-public void <B>decompress</B>(byte[]&nbsp;dstBuf,
-                       int&nbsp;desiredWidth,
-                       int&nbsp;pitch,
-                       int&nbsp;desiredHeight,
-                       int&nbsp;pixelFormat,
-                       int&nbsp;flags)
+<FONT SIZE="-1">@Deprecated
+</FONT>public void <B>decompress</B>(byte[]&nbsp;dstBuf,
+                                  int&nbsp;desiredWidth,
+                                  int&nbsp;pitch,
+                                  int&nbsp;desiredHeight,
+                                  int&nbsp;pixelFormat,
+                                  int&nbsp;flags)
                 throws java.lang.Exception</PRE>
 <DL>
 <DD><B>Deprecated.</B>&nbsp;<I>Use
@@ -825,10 +896,13 @@
 </DL>
 <HR>
 
-<A NAME="decompressToYUV(byte[], int)"><!-- --></A><H3>
+<A NAME="decompressToYUV(byte[], int, int, int, int)"><!-- --></A><H3>
 decompressToYUV</H3>
 <PRE>
 public void <B>decompressToYUV</B>(byte[]&nbsp;dstBuf,
+                            int&nbsp;desiredWidth,
+                            int&nbsp;pad,
+                            int&nbsp;desiredHeight,
                             int&nbsp;flags)
                      throws java.lang.Exception</PRE>
 <DL>
@@ -841,11 +915,71 @@
  that, if the width or height of the image is not an even multiple of the
  MCU block size (see <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#getMCUWidth(int)"><CODE>TJ.getMCUWidth(int)</CODE></A> and <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#getMCUHeight(int)"><CODE>TJ.getMCUHeight(int)</CODE></A>),
  then an intermediate buffer copy will be performed within TurboJPEG.
+ <p>
+ NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+ convention of the digital video community, the TurboJPEG API uses "YUV" to
+ refer to an image format consisting of Y, Cb, and Cr image planes.
 <P>
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>dstBuf</CODE> - buffer that will receive the YUV planar image.  Use
- <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int)</CODE></A> to determine the appropriate size for this buffer
- based on the image width, height, and level of chrominance subsampling.<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> to determine the appropriate size for this buffer
+ based on the image width, height, and level of chrominance subsampling.<DD><CODE>desiredWidth</CODE> - desired width (in pixels) of the YUV image.  If the
+ desired image dimensions are different than the dimensions of the JPEG
+ image being decompressed, then TurboJPEG will use scaling in the JPEG
+ decompressor to generate the largest possible image that will fit within
+ the desired dimensions.  Setting this to 0 is the same as setting it to
+ the width of the JPEG image (in other words, the width will not be
+ considered when determining the scaled image size.)<DD><CODE>pad</CODE> - the width of each line in each plane of the YUV image will be
+ padded to the nearest multiple of this number of bytes (must be a power of
+ 2.)<DD><CODE>desiredHeight</CODE> - desired height (in pixels) of the YUV image.  If the
+ desired image dimensions are different than the dimensions of the JPEG
+ image being decompressed, then TurboJPEG will use scaling in the JPEG
+ decompressor to generate the largest possible image that will fit within
+ the desired dimensions.  Setting this to 0 is the same as setting it to
+ the height of the JPEG image (in other words, the height will not be
+ considered when determining the scaled image size.)<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
+<A NAME="decompressToYUV(byte[], int)"><!-- --></A><H3>
+decompressToYUV</H3>
+<PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public void <B>decompressToYUV</B>(byte[]&nbsp;dstBuf,
+                                       int&nbsp;flags)
+                     throws java.lang.Exception</PRE>
+<DL>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I>
+<P>
+<DD><DL>
+
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
+<A NAME="decompressToYUV(int, int, int, int)"><!-- --></A><H3>
+decompressToYUV</H3>
+<PRE>
+public byte[] <B>decompressToYUV</B>(int&nbsp;desiredWidth,
+                              int&nbsp;pad,
+                              int&nbsp;desiredHeight,
+                              int&nbsp;flags)
+                       throws java.lang.Exception</PRE>
+<DL>
+<DD>Decompress the JPEG source image associated with this decompressor
+ instance and return a buffer containing a YUV planar image.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for more detail.
+<P>
+<DD><DL>
+<DT><B>Parameters:</B><DD><CODE>desiredWidth</CODE> - see
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for description<DD><CODE>pad</CODE> - see <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for
+ description<DD><CODE>desiredHeight</CODE> - see <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for description<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
+<DT><B>Returns:</B><DD>a buffer containing a YUV planar image
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
 </DD>
@@ -855,15 +989,14 @@
 <A NAME="decompressToYUV(int)"><!-- --></A><H3>
 decompressToYUV</H3>
 <PRE>
-public byte[] <B>decompressToYUV</B>(int&nbsp;flags)
+<FONT SIZE="-1">@Deprecated
+</FONT>public byte[] <B>decompressToYUV</B>(int&nbsp;flags)
                        throws java.lang.Exception</PRE>
 <DL>
-<DD>Decompress the JPEG source image associated with this decompressor
- instance and return a buffer containing a YUV planar image.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)"><CODE>decompressToYUV(byte[], int)</CODE></A> for more detail.
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>decompressToYUV(int, int, int, int)</CODE></A> instead.</I>
 <P>
 <DD><DL>
-<DT><B>Parameters:</B><DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
-<DT><B>Returns:</B><DD>a buffer containing a YUV planar image
+
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
 </DD>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
index 1e76ac8..0811b51 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
@@ -120,7 +120,7 @@
 <TH ALIGN="left"><B>Fields inherited from class org.libjpegturbo.turbojpeg.<A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A></B></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
-<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#handle">handle</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBuf">jpegBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBufSize">jpegBufSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight">jpegHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegSubsamp">jpegSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegWidth">jpegWidth</A></CODE></TD>
+<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#handle">handle</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBuf">jpegBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBufSize">jpegBufSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegColorspace">jpegColorspace</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight">jpegHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegSubsamp">jpegSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegWidth">jpegWidth</A></CODE></TD>
 </TR>
 </TABLE>
 &nbsp;
@@ -203,7 +203,7 @@
 <TH ALIGN="left"><B>Methods inherited from class org.libjpegturbo.turbojpeg.<A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A></B></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
-<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#close()">close</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(java.awt.image.BufferedImage, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#finalize()">finalize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getHeight()">getHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGBuf()">getJPEGBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGSize()">getJPEGSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledHeight(int, int)">getScaledHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledWidth(int, int)">getScaledWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getSubsamp()">getSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getWidth()">getWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#setJPEGImage(byte[], int)">setJPEGImage</A></CODE></TD>
+<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#close()">close</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(java.awt.image.BufferedImage, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#finalize()">finalize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getColorspace()">getColorspace</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getHeight()">getHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGBuf()">getJPEGBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGSize()">getJPEGSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledHeight(int, int)">getScaledHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledWidth(int, int)">getScaledWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getSubsamp()">getSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getWidth()">getWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#setJPEGImage(byte[], int)">setJPEGImage</A></CODE></TD>
 </TR>
 </TABLE>
 &nbsp;<A NAME="methods_inherited_from_class_java.lang.Object"><!-- --></A>
diff --git a/java/org/libjpegturbo/turbojpeg/TJ.java b/java/org/libjpegturbo/turbojpeg/TJ.java
index 6c6a95d..9fe3058 100644
--- a/java/org/libjpegturbo/turbojpeg/TJ.java
+++ b/java/org/libjpegturbo/turbojpeg/TJ.java
@@ -37,7 +37,7 @@
   /**
    * The number of chrominance subsampling options
    */
-  public static final int NUMSAMP   = 5;
+  public static final int NUMSAMP   = 6;
   /**
    * 4:4:4 chrominance subsampling (no chrominance subsampling).  The JPEG
    * or YUV image will contain one chrominance component for every pixel in the
@@ -64,6 +64,17 @@
    * Note that 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
    */
   public static final int SAMP_440  = 4;
+  /**
+   * 4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 4x1 block of pixels in the source image.
+   * JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+   * horizontal features.  Note that 4:1:1 subsampling is not fully accelerated
+   * in libjpeg-turbo.
+   */
+  public static final int SAMP_411  = 5;
 
 
   /**
@@ -82,7 +93,7 @@
   }
 
   private static final int[] mcuWidth = {
-    8, 16, 16, 8, 8
+    8, 16, 16, 8, 8, 32
   };
 
 
@@ -103,14 +114,14 @@
   }
 
   private static final int[] mcuHeight = {
-    8, 8, 16, 8, 16
+    8, 8, 16, 8, 16, 8
   };
 
 
   /**
    * The number of pixel formats
    */
-  public static final int NUMPF   = 11;
+  public static final int NUMPF   = 12;
   /**
    * RGB pixel format.  The red, green, and blue components in the image are
    * stored in 3-byte pixels in the order R, G, B from lowest to highest byte
@@ -180,6 +191,22 @@
    * interpreted as an opaque alpha channel.
    */
   public static final int PF_ARGB = 10;
+  /**
+   * CMYK pixel format.  Unlike RGB, which is an additive color model used
+   * primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+   * color model used primarily for printing.  In the CMYK color model, the
+   * value of each color component typically corresponds to an amount of cyan,
+   * magenta, yellow, or black ink that is applied to a white background.  In
+   * order to convert between CMYK and RGB, it is necessary to use a color
+   * management system (CMS.)  A CMS will attempt to map colors within the
+   * printer's gamut to perceptually similar colors in the display's gamut and
+   * vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+   * be defined with a simple formula.  Thus, such a conversion is out of scope
+   * for a codec library.  However, the TurboJPEG API allows for compressing
+   * CMYK pixels into a YCCK JPEG image (see {@link #CS_YCCK}) and
+   * decompressing YCCK JPEG images into CMYK pixels.
+   */
+  public static final int PF_CMYK = 11;
 
 
   /**
@@ -196,7 +223,7 @@
   }
 
   private static final int[] pixelSize = {
-    3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4
+    3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4
   };
 
 
@@ -218,7 +245,7 @@
   }
 
   private static final int[] redOffset = {
-    0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1
+    0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1, -1
   };
 
 
@@ -240,7 +267,7 @@
   }
 
   private static final int[] greenOffset = {
-    1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2
+    1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, -1
   };
 
 
@@ -262,11 +289,66 @@
   }
 
   private static final int[] blueOffset = {
-    2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3
+    2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3, -1
   };
 
 
   /**
+   * The number of JPEG colorspaces
+   */
+  public static final int NUMCS = 5;
+  /**
+   * RGB colorspace.  When compressing the JPEG image, the R, G, and B
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  RGB JPEG images can be
+   * decompressed to any of the extended RGB pixel formats or grayscale, but
+   * they cannot be decompressed to YUV images.
+   */
+  public static final int CS_RGB = 0;
+  /**
+   * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+   * mathematical transformation of RGB designed solely for storage and
+   * transmission.  YCbCr images must be converted to RGB before they can
+   * actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+   * component represents the black & white portion of the original image, and
+   * the Cb and Cr (chrominance) components represent the color portion of the
+   * original image.  Originally, the analog equivalent of this transformation
+   * allowed the same signal to drive both black & white and color televisions,
+   * but JPEG images use YCbCr primarily because it allows the color data to be
+   * optionally subsampled for the purposes of reducing bandwidth or disk
+   * space.  YCbCr is the most common JPEG colorspace, and YCbCr JPEG images
+   * can be compressed from and decompressed to any of the extended RGB pixel
+   * formats or grayscale, or they can be decompressed to YUV planar images.
+   */
+  public static final int CS_YCbCr = 1;
+  /**
+   * Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+   * component), and any color data from the source image is discarded.
+   * Grayscale JPEG images can be compressed from and decompressed to any of
+   * the extended RGB pixel formats or grayscale, or they can be decompressed
+   * to YUV planar images.
+   */
+  public static final int CS_GRAY = 2;
+  /**
+   * CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  CMYK JPEG images can
+   * only be decompressed to CMYK pixels.
+   */
+  public static final int CS_CMYK = 3;
+  /**
+   * YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+   * rather a mathematical transformation of CMYK designed solely for storage
+   * and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+   * reversibly transformed into YCCK, and as with YCbCr, the chrominance
+   * components in the YCCK pixels can be subsampled without incurring major
+   * perceptual loss.  YCCK JPEG images can only be compressed from and
+   * decompressed to CMYK pixels.
+   */
+  public static final int CS_YCCK = 4;
+
+
+  /**
    * The uncompressed source/destination image is stored in bottom-up (Windows,
    * OpenGL) order, not top-down (X11) order.
    */
@@ -343,6 +425,10 @@
    *
    * @param width the width (in pixels) of the YUV image
    *
+   * @param pad the width of each line in each plane of the image is padded to
+   *        the nearest multiple of this number of bytes (must be a power of
+   *        2.)
+   *
    * @param height the height (in pixels) of the YUV image
    *
    * @param subsamp the level of chrominance subsampling used in the YUV
@@ -351,6 +437,14 @@
    * @return the size of the buffer (in bytes) required to hold a YUV planar
    * image with the given width, height, and level of chrominance subsampling
    */
+  public static native int bufSizeYUV(int width, int pad, int height,
+                                      int subsamp)
+    throws Exception;
+
+  /**
+   * @deprecated Use {@link #bufSizeYUV(int, int, int, int)} instead.
+   */
+  @Deprecated
   public static native int bufSizeYUV(int width, int height, int subsamp)
     throws Exception;
 
diff --git a/java/org/libjpegturbo/turbojpeg/TJCompressor.java b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
index f8f82ac..e8dd260 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
@@ -95,8 +95,8 @@
   /**
    * Associate an uncompressed source image with this compressor instance.
    *
-   * @param srcImage image buffer containing RGB or grayscale pixels to be
-   * compressed
+   * @param srcImage image buffer containing RGB, grayscale, or CMYK pixels to
+   * be compressed
    *
    * @param x x offset (in pixels) of the region from which the JPEG image
    * should be compressed, relative to the start of <code>srcImage</code>.
@@ -145,6 +145,7 @@
    * @deprecated Use
    * {@link #setSourceImage(byte[], int, int, int, int, int, int)} instead.
    */
+  @Deprecated
   public void setSourceImage(byte[] srcImage, int width, int pitch,
                              int height, int pixelFormat) throws Exception {
     setSourceImage(srcImage, 0, 0, width, pitch, height, pixelFormat);
@@ -154,7 +155,13 @@
 
   /**
    * Set the level of chrominance subsampling for subsequent compress/encode
-   * operations.
+   * operations.  When pixels are converted from RGB to YCbCr (see
+   * {@link TJ#CS_YCbCr}) or from CMYK to YCCK (see {@link TJ#CS_YCCK}) as part
+   * of the JPEG compression process, some of the Cb and Cr (chrominance)
+   * components can be discarded or averaged together to produce a smaller
+   * image with little perceptible loss of image clarity (the human eye is more
+   * sensitive to small changes in brightness than to small changes in color.)
+   * This is called "chrominance subsampling".
    *
    * @param newSubsamp the new level of chrominance subsampling (one of
    * {@link TJ TJ.SAMP_*})
@@ -330,20 +337,38 @@
     return buf;
   }
 
+
+  /**
+   * Set the plane padding for subsequent YUV encode operations.
+   *
+   * @param pad the width of each line in each plane of the YUV image will be
+   *        padded to the nearest multiple of this number of bytes (must be a
+   *        power of 2.)  The default padding is 4 bytes, which generates
+   *        images suitable for direct video display.
+   */
+  public void setYUVPad(int pad) throws Exception {
+    if(pad < 1 || ((pad & (pad - 1)) != 0))
+      throw new Exception("Invalid argument in setYUVPad()");
+    yuvPad = pad;
+  }
+
   /**
    * Encode the uncompressed source image associated with this compressor
    * instance and output a YUV planar image to the given destination buffer.
-   * This method uses the accelerated color conversion routines in
-   * TurboJPEG's underlying codec to produce a planar YUV image that is
-   * suitable for direct video display.  Specifically, if the chrominance
-   * components are subsampled along the horizontal dimension, then the width
-   * of the luminance plane is padded to the nearest multiple of 2 in the
-   * output image (same goes for the height of the luminance plane, if the
+   * This method uses the accelerated color conversion routines in TurboJPEG's
+   * underlying codec but does not execute any of the other steps in the JPEG
+   * compression process.  The Y, U (Cb), and V (Cr) image planes are stored
+   * sequentially into the destination buffer, and the size of each plane is
+   * determined by the width and height of the source image, as well as the
+   * specified padding and level of chrominance subsampling.  If the
+   * chrominance components are subsampled along the horizontal dimension, then
+   * the width of the luminance plane is padded to the nearest multiple of 2 in
+   * the output image (same goes for the height of the luminance plane, if the
    * chrominance components are subsampled along the vertical dimension.)
-   * Also, each line of each plane in the output image is padded to 4 bytes.
-   * Although this will work with any subsampling option, it is really only
-   * useful in combination with {@link TJ#SAMP_420}, which produces an image
-   * compatible with the I420 (AKA "YUV420P") format.
+   * <p>
+   * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+   * convention of the digital video community, the TurboJPEG API uses "YUV" to
+   * refer to an image format consisting of Y, Cb, and Cr image planes.
    *
    * @param dstBuf buffer that will receive the YUV planar image.  Use
    * {@link TJ#bufSizeYUV} to determine the appropriate size for this buffer
@@ -358,9 +383,9 @@
       throw new Exception(NO_ASSOC_ERROR);
     if (subsamp < 0)
       throw new Exception("Subsampling level not set");
-    encodeYUV(srcBuf, srcWidth, srcPitch, srcHeight,
-              srcPixelFormat, dstBuf, subsamp, flags);
-    compressedSize = TJ.bufSizeYUV(srcWidth, srcHeight, subsamp);
+    encodeYUV(srcBuf, srcWidth, srcPitch, srcHeight, srcPixelFormat, dstBuf,
+              yuvPad, subsamp, flags);
+    compressedSize = TJ.bufSizeYUV(srcWidth, yuvPad, srcHeight, subsamp);
   }
 
   /**
@@ -377,7 +402,7 @@
       throw new Exception(NO_ASSOC_ERROR);
     if (subsamp < 0)
       throw new Exception("Subsampling level not set");
-    byte[] buf = new byte[TJ.bufSizeYUV(srcWidth, srcHeight, subsamp)];
+    byte[] buf = new byte[TJ.bufSizeYUV(srcWidth, yuvPad, srcHeight, subsamp)];
     encodeYUV(buf, flags);
     return buf;
   }
@@ -438,8 +463,8 @@
       int stride = sm.getScanlineStride();
       DataBufferInt db = (DataBufferInt)wr.getDataBuffer();
       int[] buf = db.getData();
-      encodeYUV(buf, width, stride, height, pixelFormat, dstBuf, subsamp,
-                flags);
+      encodeYUV(buf, width, stride, height, pixelFormat, dstBuf, yuvPad,
+                subsamp, flags);
     } else {
       ComponentSampleModel sm =
         (ComponentSampleModel)srcImage.getSampleModel();
@@ -449,10 +474,10 @@
       int pitch = sm.getScanlineStride();
       DataBufferByte db = (DataBufferByte)wr.getDataBuffer();
       byte[] buf = db.getData();
-      encodeYUV(buf, width, pitch, height, pixelFormat, dstBuf, subsamp,
-                flags);
+      encodeYUV(buf, width, pitch, height, pixelFormat, dstBuf, yuvPad,
+                subsamp, flags);
     }
-    compressedSize = TJ.bufSizeYUV(width, height, subsamp);
+    compressedSize = TJ.bufSizeYUV(width, yuvPad, height, subsamp);
   }
 
   /**
@@ -472,7 +497,7 @@
       throw new Exception("Subsampling level not set");
     int width = srcImage.getWidth();
     int height = srcImage.getHeight();
-    byte[] buf = new byte[TJ.bufSizeYUV(width, height, subsamp)];
+    byte[] buf = new byte[TJ.bufSizeYUV(width, yuvPad, height, subsamp)];
     encodeYUV(srcImage, buf, flags);
     return buf;
   }
@@ -527,11 +552,19 @@
 
   private native void encodeYUV(byte[] srcBuf, int width, int pitch,
     int height, int pixelFormat, byte[] dstBuf, int subsamp, int flags)
-    throws Exception;
+    throws Exception; // deprecated
+
+  private native void encodeYUV(byte[] srcBuf, int width, int pitch,
+    int height, int pixelFormat, byte[] dstBuf, int pad, int subsamp,
+    int flags) throws Exception;
 
   private native void encodeYUV(int[] srcBuf, int width, int stride,
     int height, int pixelFormat, byte[] dstBuf, int subsamp, int flags)
-    throws Exception;
+    throws Exception; // deprecated
+
+  private native void encodeYUV(int[] srcBuf, int width, int pitch,
+    int height, int pixelFormat, byte[] dstBuf, int pad, int subsamp,
+    int flags) throws Exception;
 
   static {
     TJLoader.load();
@@ -548,5 +581,6 @@
   private int subsamp = -1;
   private int jpegQuality = -1;
   private int compressedSize = 0;
+  private int yuvPad = 4;
   private ByteOrder byteOrder = null;
 };
diff --git a/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java b/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
index 6e46fa1..d862d66 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
@@ -39,7 +39,7 @@
   /**
    * A callback function that can be used to modify the DCT coefficients after
    * they are losslessly transformed but before they are transcoded to a new
-   * JPEG file.  This allows for custom filters or other transformations to be
+   * JPEG image.  This allows for custom filters or other transformations to be
    * applied in the frequency domain.
    *
    * @param coeffBuffer a buffer containing transformed DCT coefficients.
diff --git a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
index c2d361e..d14a989 100644
--- a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
@@ -117,7 +117,7 @@
 
   /**
    * Returns the level of chrominance subsampling used in the JPEG image
-   * associated with this decompressor instance.
+   * associated with this decompressor instance.  See {@link TJ TJ.SAMP_*}.
    *
    * @return the level of chrominance subsampling used in the JPEG image
    * associated with this decompressor instance
@@ -131,6 +131,21 @@
   }
 
   /**
+   * Returns the colorspace used in the JPEG image associated with this
+   * decompressor instance.  See {@link TJ TJ.CS_*}.
+   *
+   * @return the colorspace used in the JPEG image associated with this
+   * decompressor instance
+   */
+  public int getColorspace() throws Exception {
+    if (jpegColorspace < 0)
+      throw new Exception(NO_ASSOC_ERROR);
+    if (jpegColorspace >= TJ.NUMCS)
+      throw new Exception("JPEG header information is invalid");
+    return jpegColorspace;
+  }
+
+  /**
    * Returns the JPEG image buffer associated with this decompressor instance.
    *
    * @return the JPEG image buffer associated with this decompressor instance
@@ -313,6 +328,7 @@
    * @deprecated Use
    * {@link #decompress(byte[], int, int, int, int, int, int, int)} instead.
    */
+  @Deprecated
   public void decompress(byte[] dstBuf, int desiredWidth, int pitch,
                          int desiredHeight, int pixelFormat, int flags)
                          throws Exception {
@@ -368,44 +384,100 @@
    * that, if the width or height of the image is not an even multiple of the
    * MCU block size (see {@link TJ#getMCUWidth} and {@link TJ#getMCUHeight}),
    * then an intermediate buffer copy will be performed within TurboJPEG.
+   * <p>
+   * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+   * convention of the digital video community, the TurboJPEG API uses "YUV" to
+   * refer to an image format consisting of Y, Cb, and Cr image planes.
    *
    * @param dstBuf buffer that will receive the YUV planar image.  Use
    * {@link TJ#bufSizeYUV} to determine the appropriate size for this buffer
    * based on the image width, height, and level of chrominance subsampling.
    *
+   * @param desiredWidth desired width (in pixels) of the YUV image.  If the
+   * desired image dimensions are different than the dimensions of the JPEG
+   * image being decompressed, then TurboJPEG will use scaling in the JPEG
+   * decompressor to generate the largest possible image that will fit within
+   * the desired dimensions.  Setting this to 0 is the same as setting it to
+   * the width of the JPEG image (in other words, the width will not be
+   * considered when determining the scaled image size.)
+   *
+   * @param pad the width of each line in each plane of the YUV image will be
+   * padded to the nearest multiple of this number of bytes (must be a power of
+   * 2.)
+   *
+   * @param desiredHeight desired height (in pixels) of the YUV image.  If the
+   * desired image dimensions are different than the dimensions of the JPEG
+   * image being decompressed, then TurboJPEG will use scaling in the JPEG
+   * decompressor to generate the largest possible image that will fit within
+   * the desired dimensions.  Setting this to 0 is the same as setting it to
+   * the height of the JPEG image (in other words, the height will not be
+   * considered when determining the scaled image size.)
+   *
    * @param flags the bitwise OR of one or more of {@link TJ TJ.FLAG_*}
    */
-  public void decompressToYUV(byte[] dstBuf, int flags) throws Exception {
+  public void decompressToYUV(byte[] dstBuf, int desiredWidth, int pad,
+                              int desiredHeight, int flags) throws Exception {
     if (jpegBuf == null)
       throw new Exception(NO_ASSOC_ERROR);
-    if (dstBuf == null || flags < 0)
+    if (dstBuf == null || desiredWidth < 0 || pad < 1 ||
+        ((pad & (pad - 1)) != 0) || desiredHeight < 0 || flags < 0)
       throw new Exception("Invalid argument in decompressToYUV()");
-    decompressToYUV(jpegBuf, jpegBufSize, dstBuf, flags);
+    decompressToYUV(jpegBuf, jpegBufSize, dstBuf, desiredWidth, pad,
+                    desiredHeight, flags);
   }
 
+  /**
+   * @deprecated Use {@link #decompressToYUV(byte[], int, int, int, int)}
+   * instead.
+   */
+  @Deprecated
+  public void decompressToYUV(byte[] dstBuf, int flags) throws Exception {
+    decompressToYUV(dstBuf, 0, 4, 0, flags);
+  }
 
   /**
    * Decompress the JPEG source image associated with this decompressor
    * instance and return a buffer containing a YUV planar image.  See {@link
-   * #decompressToYUV(byte[], int)} for more detail.
+   * #decompressToYUV(byte[], int, int, int, int)} for more detail.
+   *
+   * @param desiredWidth see
+   * {@link #decompressToYUV(byte[], int, int, int, int)} for description
+   *
+   * @param pad see {@link #decompressToYUV(byte[], int, int, int, int)} for
+   * description
+   *
+   * @param desiredHeight see {@link
+   * #decompressToYUV(byte[], int, int, int, int)} for description
    *
    * @param flags the bitwise OR of one or more of {@link TJ TJ.FLAG_*}
    *
    * @return a buffer containing a YUV planar image
    */
-  public byte[] decompressToYUV(int flags) throws Exception {
+  public byte[] decompressToYUV(int desiredWidth, int pad, int desiredHeight,
+                                int flags) throws Exception {
     if (flags < 0)
       throw new Exception("Invalid argument in decompressToYUV()");
     if (jpegWidth < 1 || jpegHeight < 1 || jpegSubsamp < 0)
       throw new Exception(NO_ASSOC_ERROR);
     if (jpegSubsamp >= TJ.NUMSAMP)
       throw new Exception("JPEG header information is invalid");
-    byte[] buf = new byte[TJ.bufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp)];
-    decompressToYUV(buf, flags);
+    int scaledWidth = getScaledWidth(desiredWidth, desiredHeight);
+    int scaledHeight = getScaledHeight(desiredWidth, desiredHeight);
+    byte[] buf = new byte[TJ.bufSizeYUV(scaledWidth, pad, scaledHeight,
+                                        jpegSubsamp)];
+    decompressToYUV(buf, desiredWidth, pad, desiredHeight, flags);
     return buf;
   }
 
   /**
+   * @deprecated Use {@link #decompressToYUV(int, int, int, int)} instead.
+   */
+  @Deprecated
+  public byte[] decompressToYUV(int flags) throws Exception {
+    return decompressToYUV(0, 4, 0, flags);
+  }
+
+  /**
    * Decompress the JPEG source image associated with this decompressor
    * instance and output a decompressed image to the given destination buffer.
    *
@@ -619,7 +691,10 @@
     int flags) throws Exception;
 
   private native void decompressToYUV(byte[] srcBuf, int size, byte[] dstBuf,
-    int flags) throws Exception;
+    int flags) throws Exception; // deprecated
+
+  private native void decompressToYUV(byte[] srcBuf, int size, byte[] dstBuf,
+    int desiredWidth, int pad, int desiredheight, int flags) throws Exception;
 
   static {
     TJLoader.load();
@@ -631,5 +706,6 @@
   protected int jpegWidth = 0;
   protected int jpegHeight = 0;
   protected int jpegSubsamp = -1;
+  protected int jpegColorspace = -1;
   private ByteOrder byteOrder = null;
 };
diff --git a/java/org_libjpegturbo_turbojpeg_TJ.h b/java/org_libjpegturbo_turbojpeg_TJ.h
index d7b032a..d590831 100644
--- a/java/org_libjpegturbo_turbojpeg_TJ.h
+++ b/java/org_libjpegturbo_turbojpeg_TJ.h
@@ -8,7 +8,7 @@
 extern "C" {
 #endif
 #undef org_libjpegturbo_turbojpeg_TJ_NUMSAMP
-#define org_libjpegturbo_turbojpeg_TJ_NUMSAMP 5L
+#define org_libjpegturbo_turbojpeg_TJ_NUMSAMP 6L
 #undef org_libjpegturbo_turbojpeg_TJ_SAMP_444
 #define org_libjpegturbo_turbojpeg_TJ_SAMP_444 0L
 #undef org_libjpegturbo_turbojpeg_TJ_SAMP_422
@@ -19,8 +19,10 @@
 #define org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY 3L
 #undef org_libjpegturbo_turbojpeg_TJ_SAMP_440
 #define org_libjpegturbo_turbojpeg_TJ_SAMP_440 4L
+#undef org_libjpegturbo_turbojpeg_TJ_SAMP_411
+#define org_libjpegturbo_turbojpeg_TJ_SAMP_411 5L
 #undef org_libjpegturbo_turbojpeg_TJ_NUMPF
-#define org_libjpegturbo_turbojpeg_TJ_NUMPF 11L
+#define org_libjpegturbo_turbojpeg_TJ_NUMPF 12L
 #undef org_libjpegturbo_turbojpeg_TJ_PF_RGB
 #define org_libjpegturbo_turbojpeg_TJ_PF_RGB 0L
 #undef org_libjpegturbo_turbojpeg_TJ_PF_BGR
@@ -43,6 +45,20 @@
 #define org_libjpegturbo_turbojpeg_TJ_PF_ABGR 9L
 #undef org_libjpegturbo_turbojpeg_TJ_PF_ARGB
 #define org_libjpegturbo_turbojpeg_TJ_PF_ARGB 10L
+#undef org_libjpegturbo_turbojpeg_TJ_PF_CMYK
+#define org_libjpegturbo_turbojpeg_TJ_PF_CMYK 11L
+#undef org_libjpegturbo_turbojpeg_TJ_NUMCS
+#define org_libjpegturbo_turbojpeg_TJ_NUMCS 5L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_RGB
+#define org_libjpegturbo_turbojpeg_TJ_CS_RGB 0L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_YCbCr
+#define org_libjpegturbo_turbojpeg_TJ_CS_YCbCr 1L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_GRAY
+#define org_libjpegturbo_turbojpeg_TJ_CS_GRAY 2L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_CMYK
+#define org_libjpegturbo_turbojpeg_TJ_CS_CMYK 3L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_YCCK
+#define org_libjpegturbo_turbojpeg_TJ_CS_YCCK 4L
 #undef org_libjpegturbo_turbojpeg_TJ_FLAG_BOTTOMUP
 #define org_libjpegturbo_turbojpeg_TJ_FLAG_BOTTOMUP 2L
 #undef org_libjpegturbo_turbojpeg_TJ_FLAG_FORCEMMX
@@ -70,9 +86,17 @@
 /*
  * Class:     org_libjpegturbo_turbojpeg_TJ
  * Method:    bufSizeYUV
+ * Signature: (IIII)I
+ */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
+  (JNIEnv *, jclass, jint, jint, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJ
+ * Method:    bufSizeYUV
  * Signature: (III)I
  */
-JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III
   (JNIEnv *, jclass, jint, jint, jint);
 
 /*
diff --git a/java/org_libjpegturbo_turbojpeg_TJCompressor.h b/java/org_libjpegturbo_turbojpeg_TJCompressor.h
index 2fc9136..afec077 100644
--- a/java/org_libjpegturbo_turbojpeg_TJCompressor.h
+++ b/java/org_libjpegturbo_turbojpeg_TJCompressor.h
@@ -66,11 +66,27 @@
 /*
  * Class:     org_libjpegturbo_turbojpeg_TJCompressor
  * Method:    encodeYUV
+ * Signature: ([BIIII[BIII)V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII
+  (JNIEnv *, jobject, jbyteArray, jint, jint, jint, jint, jbyteArray, jint, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    encodeYUV
  * Signature: ([IIIII[BII)V
  */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
   (JNIEnv *, jobject, jintArray, jint, jint, jint, jint, jbyteArray, jint, jint);
 
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    encodeYUV
+ * Signature: ([IIIII[BIII)V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII
+  (JNIEnv *, jobject, jintArray, jint, jint, jint, jint, jbyteArray, jint, jint, jint);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/java/org_libjpegturbo_turbojpeg_TJDecompressor.h b/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
index f798a77..203f004 100644
--- a/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
+++ b/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
@@ -68,9 +68,17 @@
  * Method:    decompressToYUV
  * Signature: ([BI[BI)V
  */
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI
   (JNIEnv *, jobject, jbyteArray, jint, jbyteArray, jint);
 
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    decompressToYUV
+ * Signature: ([BI[BIIII)V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII
+  (JNIEnv *, jobject, jbyteArray, jint, jbyteArray, jint, jint, jint, jint);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
index 6c97814..616618d 100644
--- a/release/libjpeg-turbo.spec.in
+++ b/release/libjpeg-turbo.spec.in
@@ -124,7 +124,7 @@
 %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@
 %{_libdir}/libjpeg.so
 %{_libdir}/libjpeg.a
-%{_libdir}/libturbojpeg.so.0.0.0
+%{_libdir}/libturbojpeg.so.0.1.0
 %{_libdir}/libturbojpeg.so.0
 %{_libdir}/libturbojpeg.so
 %{_libdir}/libturbojpeg.a
diff --git a/simd/Makefile.am b/simd/Makefile.am
index a12ff6e..7b59298 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -58,6 +58,12 @@
 
 endif
 
+if SIMD_MIPSEL
+
+libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S
+
+endif
+
 AM_CPPFLAGS = -I$(top_srcdir) 
 
 .asm.lo:
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 3d4751f..907e852 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -3,6 +3,7 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2011 D. R. Commander
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,6 +19,7 @@
 #define JSIMD_SSE        0x04
 #define JSIMD_SSE2       0x08
 #define JSIMD_ARM_NEON   0x10
+#define JSIMD_MIPS_DSPR2 0x20
 
 /* Short forms of external names for systems with brain-damaged linkers. */
 
@@ -386,6 +388,64 @@
              JSAMPIMAGE input_buf, JDIMENSION input_row,
              JSAMPARRAY output_buf, int num_rows));
 
+EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+EXTERN (void) jsimd_ycc_rgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
 /* SIMD Downsample */
 EXTERN(void) jsimd_h2v2_downsample_mmx
         JPP((JDIMENSION image_width, int max_v_samp_factor,
@@ -405,6 +465,15 @@
              JDIMENSION v_samp_factor, JDIMENSION width_blocks,
              JSAMPARRAY input_data, JSAMPARRAY output_data));
 
+EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
 /* SIMD Upsample */
 EXTERN(void) jsimd_h2v2_upsample_mmx
         JPP((int max_v_samp_factor, JDIMENSION output_width,
@@ -526,6 +595,20 @@
         JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
              JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
 /* SIMD Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
new file mode 100644
index 0000000..e0cefb0
--- /dev/null
+++ b/simd/jsimd_mips.c
@@ -0,0 +1,545 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011 D. R. Commander
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on
+ * MIPS architecture.
+ *
+ * Based on the stubs from 'jsimd_none.c'
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+LOCAL(int)
+parse_proc_cpuinfo(const char* search_string)
+{
+  const char* file_name = "/proc/cpuinfo";
+  char cpuinfo_line[256];
+  FILE* f = NULL;
+  simd_support = 0;
+
+  if ((f = fopen(file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
+        fclose(f);
+        simd_support |= JSIMD_MIPS_DSPR2;
+        return 1;
+      }
+    }
+    fclose(f);
+  }
+  /* Did not find string in the proc file, or not Linux ELF. */
+  return 0;
+}
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_MIPS_DSPR2;
+#elif defined(__linux__)
+  /* We still have a chance to use MIPS DSPR2 regardless of globally used
+   * -mdspr2 options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux */
+  if (!parse_proc_cpuinfo("MIPS 74K"))
+    return;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
+
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
+      break;
+    default:
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    mipsdspr2fct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
+      break;
+  default:
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    mipsdspr2fct(cinfo->output_width, input_buf,
+        input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width,
+        cinfo->max_v_samp_factor, compptr->v_samp_factor,
+        compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width,
+        cinfo->max_v_samp_factor, compptr->v_samp_factor,
+        compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
new file mode 100644
index 0000000..83744b8
--- /dev/null
+++ b/simd/jsimd_mips_dspr2.S
@@ -0,0 +1,840 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_mips_dspr2_asm.h"
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_mips_dspr2
+ * jsimd_extbgr_ycc_convert_mips_dspr2
+ * jsimd_extrgbx_ycc_convert_mips_dspr2
+ * jsimd_extbgrx_ycc_convert_mips_dspr2
+ * jsimd_extxbgr_ycc_convert_mips_dspr2
+ * jsimd_extxrgb_ycc_convert_mips_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC r,    \
+                     g,    \
+                     b,    \
+                     inptr
+    lbu     \r, \r_offs(\inptr)
+    lbu     \g, \g_offs(\inptr)
+    lbu     \b, \b_offs(\inptr)
+    addiu   \inptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - output_buf
+ * a3     - output_row
+ * 16(sp) - num_rows
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw      t7, 48(sp)        // t7 = num_rows
+    li      s0, 0x4c8b        // FIX(0.29900)
+    li      s1, 0x9646        // FIX(0.58700)
+    li      s2, 0x1d2f        // FIX(0.11400)
+    li      s3, 0xffffd4cd    // -FIX(0.16874)
+    li      s4, 0xffffab33    // -FIX(0.33126)
+    li      s5, 0x8000        // FIX(0.50000)
+    li      s6, 0xffff94d1    // -FIX(0.41869)
+    li      s7, 0xffffeb2f    // -FIX(0.08131)
+    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
+
+0:
+    addiu   t7, -1            // --num_rows
+    lw      t6, 0(a1)         // t6 = input_buf[0]
+    lw      t0, 0(a2)
+    lw      t1, 4(a2)
+    lw      t2, 8(a2)
+    sll     t3, a3, 2
+    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
+    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
+    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
+
+    addu    t9, t2, a0        // t9 = end address
+    addiu   a3, 1
+
+1:
+    DO_RGB_TO_YCC t3, t4, t5, t6
+
+    mtlo    s5, $ac0
+    mtlo    t8, $ac1
+    mtlo    t8, $ac2
+    maddu   $ac0, s2, t5
+    maddu   $ac1, s5, t5
+    maddu   $ac2, s5, t3
+    maddu   $ac0, s0, t3
+    maddu   $ac1, s3, t3
+    maddu   $ac2, s6, t4
+    maddu   $ac0, s1, t4
+    maddu   $ac1, s4, t4
+    maddu   $ac2, s7, t5
+    extr.w  t3, $ac0, 16
+    extr.w  t4, $ac1, 16
+    extr.w  t5, $ac2, 16
+    sb      t3, 0(t0)
+    sb      t4, 0(t1)
+    sb      t5, 0(t2)
+    addiu   t0, 1
+    addiu   t2, 1
+    bne     t2, t9, 1b
+     addiu  t1, 1
+    bgtz    t7, 0b
+     addiu  a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_mips_dspr2
+ * jsimd_ycc_extbgr_convert_mips_dspr2
+ * jsimd_ycc_extrgbx_convert_mips_dspr2
+ * jsimd_ycc_extbgrx_convert_mips_dspr2
+ * jsimd_ycc_extxbgr_convert_mips_dspr2
+ * jsimd_ycc_extxrgb_convert_mips_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB  scratch0 \
+                         scratch1 \
+                         scratch2 \
+                         outptr
+    sb       \scratch0, \r_offs(\outptr)
+    sb       \scratch1, \g_offs(\outptr)
+    sb       \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+    li       t0, 0xFF
+    sb       t0, \a_offs(\outptr)
+.endif
+    addiu    \outptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - input_row
+ * a3     - output_buf
+ * 16(sp) - num_rows
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw         s1, 48(sp)
+    li         t3, 0x8000
+    li         t4, 0x166e9     // FIX(1.40200)
+    li         t5, 0x1c5a2     // FIX(1.77200)
+    li         t6, 0xffff492e  // -FIX(0.71414)
+    li         t7, 0xffffa7e6  // -FIX(0.34414)
+    repl.ph    t8, 128
+
+0:
+    lw         s0, 0(a3)
+    lw         t0, 0(a1)
+    lw         t1, 4(a1)
+    lw         t2, 8(a1)
+    sll        s5, a2, 2
+    addiu      s1, -1
+    lwx        s2, s5(t0)
+    lwx        s3, s5(t1)
+    lwx        s4, s5(t2)
+    addu       t9, s2, a0
+    addiu      a2, 1
+
+1:
+    lbu        s7, 0(s4)       // cr
+    lbu        s6, 0(s3)       // cb
+    lbu        s5, 0(s2)       // y
+    addiu      s2, 1
+    addiu      s4, 1
+    addiu      s7, -128
+    addiu      s6, -128
+    mul        t2, t7, s6
+    mul        t0, t6, s7      // Crgtab[cr]
+    sll        s7, 15
+    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
+    sll        s6, 15
+    addu       t2, t3          // Cbgtab[cb]
+    addu       t2, t0
+
+    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
+    sra        t2, 16
+    addu       t1, s5
+    addu       t2, s5          // add y
+    ins        t2, t1, 16, 16
+    subu.ph    t2, t2, t8
+    addu       t0, s5
+    shll_s.ph  t2, t2, 8
+    subu       t0, 128
+    shra.ph    t2, t2, 8
+    shll_s.w   t0, t0, 24
+    addu.ph    t2, t2, t8      // clip & store
+    sra        t0, t0, 24
+    sra        t1, t2, 16
+    addiu      t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne        s2, t9, 1b
+     addiu     s3, 1
+    bgtz       s1, 0b
+     addiu     a3, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_mips_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - downsampled_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    li             s4, 0
+    lw             s2, 0(a3)       // s2 = *output_data_ptr
+0:
+    li             t9, 2
+    lw             s1, -4(a2)      // s1 = inptr1
+
+1:
+    lw             s0, 0(a2)       // s0 = inptr0
+    lwx            s3, s4(s2)
+    addiu          s5, a1, -2      // s5 = downsampled_width - 2
+    srl            t4, s5, 1
+    sll            t4, t4, 1
+    lbu            t0, 0(s0)
+    lbu            t1, 1(s0)
+    lbu            t2, 0(s1)
+    lbu            t3, 1(s1)
+    addiu          s0, 2
+    addiu          s1, 2
+    addu           t8, s0, t4      // t8 = end address
+    andi           s5, s5, 1       // s5 = residual
+    sll            t4, t0, 1
+    sll            t6, t1, 1
+    addu           t0, t0, t4      // t0 = (*inptr0++) * 3
+    addu           t1, t1, t6      // t1 = (*inptr0++) * 3
+    addu           t7, t0, t2      // t7 = thiscolsum
+    addu           t6, t1, t3      // t5 = nextcolsum
+    sll            t0, t7, 2       // t0 = thiscolsum * 4
+    subu           t1, t0, t7      // t1 = thiscolsum * 3
+    shra_r.w       t0, t0, 4
+    addiu          t1, 7
+    addu           t1, t1, t6
+    srl            t1, t1, 4
+    sb             t0, 0(s3)
+    sb             t1, 1(s3)
+    addiu          s3, 2
+2:
+    lh             t0, 0(s0)       // t0 = A3|A2
+    lh             t2, 0(s1)       // t2 = B3|B2
+    addiu          s0, 2
+    addiu          s1, 2
+    preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
+    preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
+    shll.ph        t1, t0, 1
+    sll            t3, t6, 1
+    addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
+    addu           t3, t3, t6      // t3 = this * 3
+    addu.ph        t0, t0, t2      // t0 = next2|next1
+    addu           t1, t3, t7
+    andi           t7, t0, 0xFFFF  // t7 = next1
+    sll            t2, t7, 1
+    addu           t2, t7, t2      // t2 = next1*3
+    addu           t4, t2, t6
+    srl            t6, t0, 16      // t6 = next2
+    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
+    addu           t0, t3, t7
+    addiu          t0, 7
+    srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
+    shra_r.w       t4, t4, 4       // t3 = (next1*3 + this + 8) >> 4
+    addu           t2, t2, t6
+    addiu          t2, 7
+    srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    sb             t4, 2(s3)
+    sb             t2, 3(s3)
+    bne            t8, s0, 2b
+     addiu         s3, 4
+    beqz           s5, 4f
+     addu          t8, s0, s5
+3:
+    lbu            t0, 0(s0)
+    lbu            t2, 0(s1)
+    addiu          s0, 1
+    addiu          s1, 1
+    sll            t3, t6, 1
+    sll            t1, t0, 1
+    addu           t1, t0, t1      // t1 = inptr0 * 3
+    addu           t3, t3, t6      // t3 = thiscolsum * 3
+    addu           t5, t1, t2
+    addu           t1, t3, t7
+    shra_r.w       t1, t1, 4
+    addu           t0, t3, t5
+    addiu          t0, 7
+    srl            t0, t0, 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    addiu          s3, 2
+    move           t7, t6
+    bne            t8, s0, 3b
+     move          t6, t5
+4:
+    sll            t0, t6, 2       // t0 = thiscolsum * 4
+    subu           t1, t0, t6      // t1 = thiscolsum * 3
+    addu           t1, t1, t7
+    addiu          s4, 4
+    shra_r.w       t1, t1, 4
+    addiu          t0, 7
+    srl            t0, t0, 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    addiu          t9, -1
+    addiu          s3, 2
+    bnez           t9, 1b
+     lw            s1, 4(a2)
+    srl            t0, s4, 2
+    subu           t0, a0, t0
+    bgtz           t0, 0b
+     addiu         a2, 4
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j ra
+     nop
+END(jsimd_h2v2_fancy_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - downsampled_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    .set at
+
+    beqz           a0, 3f
+     sll           t0, a0, 2
+    lw             s1, 0(a3)
+    addu           s0, s1, t0
+    li             s3, 0x10001
+0:
+    addiu          t8, a1, -2
+    srl            t9, t8, 2
+    lw             t7, 0(a2)
+    lw             s2, 0(s1)
+    lbu            t0, 0(t7)
+    lbu            t1, 1(t7)   // t1 = inptr[1]
+    sll            t2, t0, 1
+    addu           t2, t2, t0  // t2 = invalue*3
+    addu           t2, t2, t1
+    shra_r.w       t2, t2, 2
+    sb             t0, 0(s2)
+    sb             t2, 1(s2)
+    beqz           t9, 11f
+     addiu         s2, 2
+1:
+    ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
+    ulw            t1, 1(t7)
+    ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
+    preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
+    preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
+    preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
+    preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
+    preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
+    shll.ph        t5, t4, 1
+    shll.ph        t6, t1, 1
+    addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
+    addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
+    addu.ph        t4, t3, s3
+    addu.ph        t0, t0, s3
+    addu.ph        t4, t4, t5
+    addu.ph        t0, t0, t6
+    shrl.ph        t4, t4, 2   // t4 = |0|P3|0|P2|
+    shrl.ph        t0, t0, 2   // t0 = |0|P1|0|P0|
+    addu.ph        t2, t2, t5
+    addu.ph        t3, t3, t6
+    shra_r.ph      t2, t2, 2   // t2 = |0|P5|0|P4|
+    shra_r.ph      t3, t3, 2   // t3 = |0|P3|0|P2|
+    shll.ph        t2, t2, 8
+    shll.ph        t3, t3, 8
+    or             t2, t4, t2
+    or             t3, t3, t0
+    addiu          t9, -1
+    usw            t3, 0(s2)
+    usw            t2, 4(s2)
+    addiu          s2, 8
+    bgtz           t9, 1b
+     addiu         t7, 4
+11:
+    andi           t8, 3
+    beqz           t8, 3f
+     addiu         t7, 1
+2:
+    lbu            t0, 0(t7)
+    addiu          t7, 1
+    sll            t1, t0, 1
+    addu           t2, t0, t1  // t2 = invalue
+    lbu            t3, -2(t7)
+    lbu            t4, 0(t7)
+    addiu          t3, 1
+    addiu          t4, 2
+    addu           t3, t3, t2
+    addu           t4, t4, t2
+    srl            t3, 2
+    srl            t4, 2
+    sb             t3, 0(s2)
+    sb             t4, 1(s2)
+    addiu          t8, -1
+    bgtz           t8, 2b
+     addiu         s2, 2
+
+    lbu            t0, 0(t7)
+    lbu            t2, -1(t7)
+    sll            t1, t0, 1
+    addu           t1, t1, t0 // t1 = invalue * 3
+    addu           t1, t1, t2
+    addiu          t1, 1
+    srl            t1, t1, 2
+    sb             t1, 0(s2)
+    sb             t0, 1(s2)
+    addiu          s1, 4
+    bne            s1, s0, 0b
+     addiu         a2, 4
+3:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j              ra
+     nop
+END(jsimd_h2v1_fancy_upsample_mips_dspr2)
+
+/*****************************************************************************/
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - cinfo->max_v_samp_factor
+ * a2     - compptr->v_samp_factor
+ * a3     - compptr->width_in_blocks
+ * 16(sp) - input_data
+ * 20(sp) - output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+    beqz        a2, 7f
+     lw         s1, 44(sp)  // s1 = output_data
+    lw          s0, 40(sp)  // s0 = input_data
+    srl         s2, a0, 2
+    andi        t9, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3   // t0 = width_in_blocks*DCT
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    andi        t6, a0, 1   // t6 = temp_index
+    addiu       t6, -1
+    lw          t4, 0(s1)   // t4 = outptr
+    lw          t5, 0(s0)   // t5 = inptr0
+    li          s3, 0       // s3 = bias
+    srl         t7, a0, 1   // t7 = image_width1
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+1:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 2(t5)
+    ulhu        t2, 4(t5)
+    ulhu        t3, 6(t5)
+    raddu.w.qb  t0, t0
+    raddu.w.qb  t1, t1
+    raddu.w.qb  t2, t2
+    raddu.w.qb  t3, t3
+    shra.ph     t0, t0, 1
+    shra_r.ph   t1, t1, 1
+    shra.ph     t2, t2, 1
+    shra_r.ph   t3, t3, 1
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t3, 3(t4)
+    addiu       s4, -1
+    addiu       t4, 4
+    bgtz        s4, 1b
+     addiu      t5, 8
+    beqz        t8, 3f
+     addu       s4, t4, t8
+2:
+    ulhu        t0, 0(t5)
+    raddu.w.qb  t0, t0
+    addqh.w     t0, t0, s3
+    xori        s3, s3, 1
+    sb          t0, 0(t4)
+    addiu       t4, 1
+    bne         t4, s4, 2b
+     addiu      t5, 2
+3:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    addqh.w     t2, t1, s3  // t2 = pixval1
+    xori        s3, s3, 1
+    addqh.w     t3, t1, s3  // t3 = pixval2
+    blez        s2, 5f
+     append     t3, t2,  8
+    addu        t5, t4, s2  // t5 = loop_end2
+4:
+    ush         t3, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 4b
+     addiu      t4,  2
+5:
+    beqz        t9, 6f
+     nop
+    sb          t2, 0(t4)
+6:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 4
+7:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+    j           ra
+    nop
+END(jsimd_h2v1_downsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
+
+/*
+ * a0     - cinfo->image_width
+ * a1     - cinfo->max_v_samp_factor
+ * a2     - compptr->v_samp_factor
+ * a3     - compptr->width_in_blocks
+ * 16(sp) - input_data
+ * 20(sp) - output_data
+ */
+    .set at
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    beqz         a2, 8f
+     lw          s1, 52(sp)      // s1 = output_data
+    lw           s0, 48(sp)      // s0 = input_data
+
+    andi         t6, a0, 1       // t6 = temp_index
+    addiu        t6, -1
+    srl          t7, a0, 1       // t7 = image_width1
+    srl          s4, t7, 2
+    andi         t8, t7, 3
+    andi         t9, a0, 2
+    srl          s2, a0, 2
+    srl          t7, t9, 1
+    addu         s2, t7, s2
+    sll          t0, a3, 3       // s2 = width_in_blocks*DCT
+    srl          t7, t0, 1
+    subu         s2, t7, s2
+0:
+    lw           t4, 0(s1)       // t4 = outptr
+    lw           t5, 0(s0)       // t5 = inptr0
+    lw           s7, 4(s0)       // s7 = inptr1
+    li           s6, 1           // s6 = bias
+2:
+    ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
+    ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
+    ulw          t2, 4(t5)
+    ulw          t3, 4(s7)
+    precrq.ph.w  t7, t0, t1      // t2 = |P3|P2|Q3|Q2|
+    ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
+    raddu.w.qb   t1, t7
+    raddu.w.qb   t0, t0
+    shra_r.w     t1, t1, 2
+    addiu        t0, 1
+    srl          t0, 2
+    precrq.ph.w  t7, t2, t3
+    ins          t2, t3, 16, 16
+    raddu.w.qb   t7, t7
+    raddu.w.qb   t2, t2
+    shra_r.w     t7, t7, 2
+    addiu        t2, 1
+    srl          t2, 2
+    sb           t0, 0(t4)
+    sb           t1, 1(t4)
+    sb           t2, 2(t4)
+    sb           t7, 3(t4)
+    addiu        t4, 4
+    addiu        t5, 8
+    addiu        s4, s4, -1
+    bgtz         s4, 2b
+     addiu       s7, 8
+    beqz         t8, 4f
+     addu        t8, t4, t8
+3:
+    ulhu         t0, 0(t5)
+    ulhu         t1, 0(s7)
+    ins          t0, t1, 16, 16
+    raddu.w.qb   t0, t0
+    addu         t0, t0, s6
+    srl          t0, 2
+    xori         s6, s6, 3
+    sb           t0, 0(t4)
+    addiu        t5, 2
+    addiu        t4, 1
+    bne          t8, t4, 3b
+     addiu       s7, 2
+4:
+    lbux         t1, t6(t5)
+    sll          t1, 1
+    lbux         t0, t6(s7)
+    sll          t0, 1
+    addu         t1, t1, t0
+    addu         t3, t1, s6
+    srl          t0, t3, 2       // t2 = pixval1
+    xori         s6, s6, 3
+    addu         t2, t1, s6
+    srl          t1, t2, 2       // t3 = pixval2
+    blez         s2, 6f
+     append      t1, t0, 8
+5:
+    ush          t1, 0(t4)
+    addiu        s2, -1
+    bgtz         s2, 5b
+     addiu       t4, 2
+6:
+    beqz         t9, 7f
+     nop
+    sb           t0, 0(t4)
+7:
+    addiu        s1, 4
+    addiu        a2, -1
+    bnez         a2, 0b
+     addiu       s0, 8
+8:
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j            ra
+     nop
+END(jsimd_h2v2_downsample_mips_dspr2)
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - cinfo->output_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+    lw      t7, 0(a3)       // t7 = output_data
+    andi    t8, a1, 0xf     // t8 = residual
+    sll     t0, a0, 2
+    beqz    a0, 4f
+     addu   t9, t7, t0      // t9 = output_data end address
+0:
+    lw      t5, 0(t7)       // t5 = outptr
+    lw      t6, 0(a2)       // t6 = inptr
+    addu    t3, t5, a1      // t3 = outptr + output_width (end address)
+    subu    t3, t8          // t3 = end address - residual
+    beqz    t3, 2f
+     nop
+1:
+    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
+    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
+    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
+    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
+    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
+    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
+    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
+    usw     t0, 0(t5)
+    usw     t1, 4(t5)
+    srl     t0, t2, 16      // t0 = |X|X|P7|P6|
+    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
+    ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
+    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
+    ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
+    usw     t2, 8(t5)
+    usw     t0, 12(t5)
+    addiu   t5, 16
+    bne     t5, t3, 1b
+     addiu  t6, 8
+    beqz    t8, 3f
+     move   t4, t8
+2:
+    lbu     t1, 0(t6)
+    sb      t1, 0(t5)
+    sb      t1, 1(t5)
+    addiu   t4, -2
+    addiu   t6, 1
+    bgtz    t4, 2b
+     addiu  t5, 2
+3:
+    addiu   t7, 4
+    bne     t9, t7, 0b
+     addiu  a2, 4
+4:
+    j       ra
+     nop
+END(jsimd_h2v1_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - cinfo->output_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+    lw      t7, 0(a3)
+    beqz    a0, 7f
+     andi   t9, a1, 0xf     // t9 = residual
+0:
+    lw      t6, 0(a2)       // t6 = inptr
+    lw      t5, 0(t7)       // t5 = outptr
+    addu    t8, t5, a1      // t8 = outptr end address
+    subu    t8, t9          // t8 = end address - residual
+    beqz    t8, 2f
+     nop
+1:
+    ulw     t0, 0(t6)
+    srl     t1, t0, 16
+    ins     t0, t0, 16, 16
+    ins     t0, t0, 8, 16
+    ins     t1, t1, 16, 16
+    ins     t1, t1, 8, 16
+    ulw     t2, 4(t6)
+    usw     t0, 0(t5)
+    usw     t1, 4(t5)
+    srl     t3, t2, 16
+    ins     t2, t2, 16, 16
+    ins     t2, t2, 8, 16
+    ins     t3, t3, 16, 16
+    ins     t3, t3, 8, 16
+    usw     t2, 8(t5)
+    usw     t3, 12(t5)
+    addiu   t5, 16
+    bne     t5, t8, 1b
+     addiu  t6, 8
+    beqz    t9, 3f
+     move   t4, t9
+2:
+    lbu     t0, 0(t6)
+    sb      t0, 0(t5)
+    sb      t0, 1(t5)
+    addiu   t4, -2
+    addiu   t6, 1
+    bgtz    t4, 2b
+     addiu  t5, 2
+3:
+    ulw     t6, 0(t7)       // t6 = outptr
+    ulw     t5, 4(t7)       // t5 = outptr[1]
+    addu    t4, t6, a1      // t4 = new end address
+    subu    t8, t4, t9
+    beqz    t8, 5f
+     nop
+4:
+    ulw     t0, 0(t6)
+    ulw     t1, 4(t6)
+    ulw     t2, 8(t6)
+    usw     t0, 0(t5)
+    ulw     t0, 12(t6)
+    usw     t1, 4(t5)
+    usw     t2, 8(t5)
+    usw     t0, 12(t5)
+    addiu   t6, 16
+    bne     t6, t8, 4b
+     addiu  t5, 16
+    beqz    t9, 6f
+     nop
+5:
+    lbu     t0, 0(t6)
+    sb      t0, 0(t5)
+    addiu   t6, 1
+    bne     t6, t4, 5b
+     addiu  t5, 1
+6:
+    addiu   t7, 8
+    addiu   a0, -2
+    bgtz    a0, 0b
+     addiu  a2, 4
+7:
+    j       ra
+     nop
+END(jsimd_h2v2_upsample_mips_dspr2)
+
+/*****************************************************************************/
diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h
new file mode 100644
index 0000000..53cf2bc
--- /dev/null
+++ b/simd/jsimd_mips_dspr2_asm.h
@@ -0,0 +1,252 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0
+#define AT   $1
+#define v0   $2
+#define v1   $3
+#define a0   $4
+#define a1   $5
+#define a2   $6
+#define a3   $7
+#define t0   $8
+#define t1   $9
+#define t2   $10
+#define t3   $11
+#define t4   $12
+#define t5   $13
+#define t6   $14
+#define t7   $15
+#define s0   $16
+#define s1   $17
+#define s2   $18
+#define s3   $19
+#define s4   $20
+#define s5   $21
+#define s6   $22
+#define s7   $23
+#define t8   $24
+#define t9   $25
+#define k0   $26
+#define k1   $27
+#define gp   $28
+#define sp   $29
+#define fp   $30
+#define s8   $30
+#define ra   $31
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol)                           \
+                .globl  symbol;                         \
+                .align  2;                              \
+                .type   symbol, @function;              \
+                .ent    symbol, 0;                      \
+symbol:         .frame  sp, 0, ra;                      \
+                .set    push;                           \
+                .set    arch=mips32r2;                  \
+                .set    noreorder;                      \
+                .set    noat;
+
+/*
+ * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_MIPS_DSPR2(symbol)                         \
+LEAF_MIPS32R2(symbol)                                   \
+                .set    dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function)                                   \
+                .set    pop;                            \
+                .end    function;                       \
+                .size   function,.-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+                          r2  = 0, r3  = 0, r4  = 0, \
+                          r5  = 0, r6  = 0, r7  = 0, \
+                          r8  = 0, r9  = 0, r10 = 0, \
+                          r11 = 0, r12 = 0, r13 = 0, \
+                          r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, -\stack_offset
+    .endif
+    sw              \r1, 0(sp)
+    .if \r2 != 0
+    sw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    sw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    sw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw              \r14, 52(sp)
+    .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+                               r2  = 0, r3  = 0, r4  = 0, \
+                               r5  = 0, r6  = 0, r7  = 0, \
+                               r8  = 0, r9  = 0, r10 = 0, \
+                               r11 = 0, r12 = 0, r13 = 0, \
+                               r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    lw              \r1, 0(sp)
+    .if \r2 != 0
+    lw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    lw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    lw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw              \r14, 52(sp)
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, \stack_offset
+    .endif
+.endm
+
+
diff --git a/tjbench.c b/tjbench.c
index 105a4cc..b2ef8d6 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -54,15 +54,31 @@
 };
 const char *subNameLong[TJ_NUMSAMP]=
 {
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *subName[NUMSUBOPT]={"444", "422", "420", "GRAY", "440"};
+const char *csName[TJ_NUMCS]=
+{
+	"RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
+};
+const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
 tjscalingfactor *scalingfactors=NULL, sf={1, 1};  int nsf=0;
 int xformop=TJXOP_NONE, xformopt=0;
 int (*customFilter)(short *, tjregion, tjregion, int, int, tjtransform *);
 double benchtime=5.0;
 
 
+char *formatName(int subsamp, int cs, char *buf)
+{
+	if(cs==TJCS_YCbCr) return (char *)subNameLong[subsamp];
+	else if(cs==TJCS_YCCK)
+	{
+		snprintf(buf, 80, "%s %s", csName[cs], subNameLong[subsamp]);
+		return buf;
+	}
+	else return (char *)csName[cs];
+}
+
+
 char *sigfig(double val, int figs, char *buf, int len)
 {
 	char format[80];
@@ -466,7 +482,7 @@
 	unsigned char **jpegbuf=NULL, *srcbuf=NULL;
 	unsigned long *jpegsize=NULL, srcsize, totaljpegsize;
 	tjtransform *t=NULL;
-	int w=0, h=0, subsamp=-1, _w, _h, _tilew, _tileh,
+	int w=0, h=0, subsamp=-1, cs=-1, _w, _h, _tilew, _tileh,
 		_ntilesw, _ntilesh, _subsamp;
 	char *temp=NULL, tempstr[80], tempstr2[80];
 	int row, col, i, tilew, tileh, ntilesw=1, ntilesh=1, retval=0;
@@ -490,20 +506,25 @@
 
 	if((handle=tjInitTransform())==NULL)
 		_throwtj("executing tjInitTransform()");
-	if(tjDecompressHeader2(handle, srcbuf, srcsize, &w, &h, &subsamp)==-1)
-		_throwtj("executing tjDecompressHeader2()");
+	if(tjDecompressHeader3(handle, srcbuf, srcsize, &w, &h, &subsamp, &cs)==-1)
+		_throwtj("executing tjDecompressHeader3()");
 
 	if(quiet==1)
 	{
 		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap\tBitmap\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
+		printf("Bitmap\tBitmap\tJPEG\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
 			dotile? "Tile ":"Image", dotile? "Tile ":"Image");
-		printf("Format\tOrder\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n\n");
+		printf("Format\tOrder\tCS\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n\n");
 	}
 	else if(!quiet)
 	{
-		printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n", subNameLong[subsamp],
-			pixFormatStr[pf], (flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down");
+		if(yuv==YUVDECODE)
+			printf(">>>>>  JPEG %s --> YUV  <<<<<\n",
+				formatName(subsamp, cs, tempstr));
+		else
+			printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n",
+				formatName(subsamp, cs, tempstr), pixFormatStr[pf],
+				(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down");
 	}
 
 	for(tilew=dotile? 16:w, tileh=dotile? 16:h; ; tilew*=2, tileh*=2)
@@ -539,8 +560,8 @@
 		}
 		else if(quiet==1)
 		{
-			printf("%s\t%s\t%s\t", pixFormatStr[pf],
-				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp]);
+			printf("%s\t%s\t%s\t%s\t", pixFormatStr[pf],
+				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", csName[cs], subNameLong[subsamp]);
 			printf("%-4d  %-4d\t", tilew, tileh);
 		}
 
@@ -686,6 +707,7 @@
 	printf("     codec\n");
 	printf("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the\n");
 	printf("     underlying codec\n");
+	printf("-411 = Test 4:1:1 chrominance subsampling instead of 4:2:0\n");
 	printf("-440 = Test 4:4:0 chrominance subsampling instead of 4:2:2\n");
 	printf("-quiet = Output results in tabular rather than verbose format\n");
 	printf("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG\n");
@@ -720,7 +742,7 @@
 {
 	unsigned char *srcbuf=NULL;  int w, h, i, j;
 	int minqual=-1, maxqual=-1;  char *temp;
-	int minarg=2, retval=0, do440=0;
+	int minarg=2, retval=0, do440=0, do411=0;
 
 	if((scalingfactors=tjGetScalingFactors(&nsf))==NULL || nsf==0)
 		_throwtj("executing tjGetScalingFactors()");
@@ -858,6 +880,7 @@
 			if(!strcmp(argv[i], "-?")) usage(argv[0]);
 			if(!strcasecmp(argv[i], "-alloc")) flags&=(~TJFLAG_NOREALLOC);
 			if(!strcasecmp(argv[i], "-bmp")) ext="bmp";
+			if(!strcasecmp(argv[i], "-411")) do411=1;
 		}
 	}
 
@@ -901,7 +924,7 @@
 		dotest(srcbuf, w, h, TJ_GRAYSCALE, i, argv[1]);
 	printf("\n");
 	for(i=maxqual; i>=minqual; i--)
-		dotest(srcbuf, w, h, TJ_420, i, argv[1]);
+		dotest(srcbuf, w, h, do411? TJSAMP_411:TJ_420, i, argv[1]);
 	printf("\n");
 	for(i=maxqual; i>=minqual; i--)
 		dotest(srcbuf, w, h, do440? TJSAMP_440:TJ_422, i, argv[1]);
diff --git a/tjunittest.c b/tjunittest.c
index 6f23b23..b738c6c 100644
--- a/tjunittest.c
+++ b/tjunittest.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2012 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2013 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,8 @@
 	printf("\nUSAGE: %s [options]\n", progName);
 	printf("Options:\n");
 	printf("-yuv = test YUV encoding/decoding support\n");
+	printf("-noyuvpad = do not pad each line of each Y, U, and V plane to the nearest\n");
+	printf("            4-byte boundary\n");
 	printf("-alloc = test automatic buffer allocation\n");
 	exit(1);
 }
@@ -59,25 +61,26 @@
 
 const char *subNameLong[TJ_NUMSAMP]=
 {
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440"};
+const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
 
 const char *pixFormatStr[TJ_NUMPF]=
 {
 	"RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
-	"RGBA", "BGRA", "ABGR", "ARGB"
+	"RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
 };
 
-const int alphaOffset[TJ_NUMPF] = {-1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0};
+const int alphaOffset[TJ_NUMPF] = {-1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1};
 
 const int _3byteFormats[]={TJPF_RGB, TJPF_BGR};
-const int _4byteFormats[]={TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB};
+const int _4byteFormats[]={TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB,
+	TJPF_CMYK};
 const int _onlyGray[]={TJPF_GRAY};
 const int _onlyRGB[]={TJPF_RGB};
 
 enum {YUVENCODE=1, YUVDECODE};
-int yuv=0, alloc=0;
+int yuv=0, alloc=0, pad=4;
 
 int exitStatus=0;
 #define bailout() {exitStatus=-1;  goto bailout;}
@@ -91,9 +94,9 @@
 	int ps=tjPixelSize[pf];
 	int index, row, col, halfway=16;
 
-	memset(buf, 0, w*h*ps);
 	if(pf==TJPF_GRAY)
 	{
+		memset(buf, 0, w*h*ps);
 		for(row=0; row<h; row++)
 		{
 			for(col=0; col<w; col++)
@@ -105,8 +108,30 @@
 			}
 		}
 	}
+	else if(pf==TJPF_CMYK)
+	{
+		memset(buf, 255, w*h*ps);
+		for(row=0; row<h; row++)
+		{
+			for(col=0; col<w; col++)
+			{
+				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
+				else index=row*w+col;
+				if(((row/8)+(col/8))%2==0)
+				{
+					if(row>=halfway) buf[index*ps+3]=0;
+				}
+				else
+				{
+					buf[index*ps+2]=0;
+					if(row<halfway) buf[index*ps+1]=0;
+				}
+			}
+		}
+	}
 	else
 	{
+		memset(buf, 0, w*h*ps);
 		for(row=0; row<h; row++)
 		{
 			for(col=0; col<w; col++)
@@ -165,6 +190,36 @@
 	int halfway=16*sf.num/sf.denom;
 	int blocksize=8*sf.num/sf.denom;
 
+	if(pf==TJPF_CMYK)
+	{
+		for(row=0; row<h; row++)
+		{
+			for(col=0; col<w; col++)
+			{
+				unsigned char c, m, y, k;
+				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
+				else index=row*w+col;
+				c=buf[index*ps];
+				m=buf[index*ps+1];
+				y=buf[index*ps+2];
+				k=buf[index*ps+3];
+				if(((row/blocksize)+(col/blocksize))%2==0)
+				{
+					checkval255(c);  checkval255(m);  checkval255(y);
+					if(row<halfway) checkval255(k)
+					else checkval0(k)
+				}
+				else
+				{
+					checkval255(c);  checkval0(y);  checkval255(k);
+					if(row<halfway) checkval0(m)
+					else checkval255(m)
+				}
+			}
+		}
+		return 1;
+	}
+
 	for(row=0; row<h; row++)
 	{
 		for(col=0; col<w; col++)
@@ -223,8 +278,13 @@
 		{
 			for(col=0; col<w; col++)
 			{
-				printf("%.3d/%.3d/%.3d ", buf[(row*w+col)*ps+roffset],
-					buf[(row*w+col)*ps+goffset], buf[(row*w+col)*ps+boffset]);
+				if(pf==TJPF_CMYK)
+					printf("%.3d/%.3d/%.3d/%.3d ", buf[(row*w+col)*ps],
+						buf[(row*w+col)*ps+1], buf[(row*w+col)*ps+2],
+						buf[(row*w+col)*ps+3]);
+				else
+					printf("%.3d/%.3d/%.3d ", buf[(row*w+col)*ps+roffset],
+						buf[(row*w+col)*ps+goffset], buf[(row*w+col)*ps+boffset]);
 			}
 			printf("\n");
 		}
@@ -235,22 +295,24 @@
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
 
-int checkBufYUV(unsigned char *buf, int w, int h, int subsamp)
+int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
+	tjscalingfactor sf)
 {
 	int row, col;
 	int hsf=tjMCUWidth[subsamp]/8, vsf=tjMCUHeight[subsamp]/8;
 	int pw=PAD(w, hsf), ph=PAD(h, vsf);
 	int cw=pw/hsf, ch=ph/vsf;
-	int ypitch=PAD(pw, 4), uvpitch=PAD(cw, 4);
+	int ypitch=PAD(pw, pad), uvpitch=PAD(cw, pad);
 	int retval=1;
-	int halfway=16;
+	int halfway=16*sf.num/sf.denom;
+	int blocksize=8*sf.num/sf.denom;
 
 	for(row=0; row<ph; row++)
 	{
 		for(col=0; col<pw; col++)
 		{
 			unsigned char y=buf[ypitch*row+col];
-			if(((row/8)+(col/8))%2==0)
+			if(((row/blocksize)+(col/blocksize))%2==0)
 			{
 				if(row<halfway) checkval255(y)  else checkval0(y);
 			}
@@ -262,14 +324,14 @@
 	}
 	if(subsamp!=TJSAMP_GRAY)
 	{
-		halfway=16/vsf;
+		int halfway=16/vsf*sf.num/sf.denom;
 		for(row=0; row<ch; row++)
 		{
 			for(col=0; col<cw; col++)
 			{
 				unsigned char u=buf[ypitch*ph + (uvpitch*row+col)],
 					v=buf[ypitch*ph + uvpitch*ch + (uvpitch*row+col)];
-				if(((row*vsf/8)+(col*hsf/8))%2==0)
+				if(((row*vsf/blocksize)+(col*hsf/blocksize))%2==0)
 				{
 					checkval(u, 128);  checkval(v, 128);
 				}
@@ -354,14 +416,15 @@
 	t=gettime();
 	if(yuv==YUVENCODE)
 	{
-		_tj(tjEncodeYUV2(handle, srcBuf, w, 0, h, pf, *dstBuf, subsamp, flags));
+		_tj(tjEncodeYUV3(handle, srcBuf, w, 0, h, pf, *dstBuf, pad, subsamp,
+			flags));
 	}
 	else
 	{
 		if(!alloc)
 		{
 			flags|=TJFLAG_NOREALLOC;
-			*dstSize=(yuv==YUVENCODE? tjBufSizeYUV(w, h, subsamp)
+			*dstSize=(yuv==YUVENCODE? tjBufSizeYUV2(w, pad, h, subsamp)
 				: tjBufSize(w, h, subsamp));
 		}
 		_tj(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
@@ -379,7 +442,8 @@
 	writeJPEG(*dstBuf, *dstSize, tempStr);
 	if(yuv==YUVENCODE)
 	{
-		if(checkBufYUV(*dstBuf, w, h, subsamp)) printf("Passed.");
+		tjscalingfactor sf={1, 1};
+		if(checkBufYUV(*dstBuf, w, h, subsamp, sf)) printf("Passed.");
 		else printf("FAILED!");
 	}
 	else printf("Done.");
@@ -403,22 +467,21 @@
 	if(yuv==YUVENCODE) return;
 
 	if(yuv==YUVDECODE)
-		printf("JPEG -> YUV %s ... ", subNameLong[subsamp]);
+		printf("JPEG -> YUV %s ", subNameLong[subsamp]);
 	else
-	{
 		printf("JPEG -> %s %s ", pixFormatStr[pf],
 			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ");
-		if(sf.num!=1 || sf.denom!=1)
-			printf("%d/%d ... ", sf.num, sf.denom);
-		else printf("... ");
-	}
+	if(sf.num!=1 || sf.denom!=1)
+		printf("%d/%d ... ", sf.num, sf.denom);
+	else printf("... ");
 
 	_tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
 		&_hdrsubsamp));
 	if(_hdrw!=w || _hdrh!=h || _hdrsubsamp!=subsamp)
 		_throw("Incorrect JPEG header");
 
-	if(yuv==YUVDECODE) dstSize=tjBufSizeYUV(w, h, subsamp);
+	if(yuv==YUVDECODE)
+		dstSize=tjBufSizeYUV2(scaledWidth, pad, scaledHeight, subsamp);
 	else dstSize=scaledWidth*scaledHeight*tjPixelSize[pf];
 	if((dstBuf=(unsigned char *)malloc(dstSize))==NULL)
 		_throw("Memory allocation failure");
@@ -427,7 +490,8 @@
 	t=gettime();
 	if(yuv==YUVDECODE)
 	{
-		_tj(tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags));
+		_tj(tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth,
+			pad, scaledHeight, flags));
 	}
 	else
 	{
@@ -438,7 +502,8 @@
 
 	if(yuv==YUVDECODE)
 	{
-		if(checkBufYUV(dstBuf, w, h, subsamp)) printf("Passed.");
+		if(checkBufYUV(dstBuf, scaledWidth, scaledHeight, subsamp, sf))
+			printf("Passed.");
 		else printf("FAILED!");
 	}
 	else
@@ -459,18 +524,19 @@
 	int flags)
 {
 	int i, n=0;
-	tjscalingfactor *sf=tjGetScalingFactors(&n), sf1={1, 1};
+	tjscalingfactor *sf=tjGetScalingFactors(&n);
 	if(!sf || !n) _throwtj();
 
-	if((subsamp==TJSAMP_444 || subsamp==TJSAMP_GRAY) && !yuv)
+	for(i=0; i<n; i++)
 	{
-		for(i=0; i<n; i++)
+		if(subsamp==TJSAMP_444 || subsamp==TJSAMP_GRAY ||
+			(subsamp==TJSAMP_411 && sf[i].num==1 &&
+				(sf[i].denom==2 || sf[i].denom==1)) ||
+			(subsamp!=TJSAMP_411 && sf[i].num==1 &&
+				(sf[i].denom==4 || sf[i].denom==2 || sf[i].denom==1)))
 			_decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp,
 				flags, sf[i]);
 	}
-	else
-		_decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp, flags,
-			sf1);
 
 	bailout:
 	return;
@@ -486,7 +552,7 @@
 
 	if(!alloc)
 	{
-		size=(yuv==YUVENCODE? tjBufSizeYUV(w, h, subsamp)
+		size=(yuv==YUVENCODE? tjBufSizeYUV2(w, pad, h, subsamp)
 			: tjBufSize(w, h, subsamp));
 		if((dstBuf=(unsigned char *)tjAlloc(size))==NULL)
 			_throw("Memory allocation failure.");
@@ -500,7 +566,8 @@
 		for(i=0; i<2; i++)
 		{
 			int flags=0;
-			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440)
+			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440 ||
+				subsamp==TJSAMP_411)
 				flags|=TJFLAG_FASTUPSAMPLE;
 			if(i==1)
 			{
@@ -601,7 +668,7 @@
 
 int main(int argc, char *argv[])
 {
-	int doyuv=0, i;
+	int doyuv=0, i, num4bf=5;
 	#ifdef _WIN32
 	srand((unsigned int)time(NULL));
 	#endif
@@ -610,24 +677,27 @@
 		for(i=1; i<argc; i++)
 		{
 			if(!strcasecmp(argv[i], "-yuv")) doyuv=1;
+			if(!strcasecmp(argv[i], "-noyuvpad")) pad=1;
 			if(!strcasecmp(argv[i], "-alloc")) alloc=1;
 			if(!strncasecmp(argv[i], "-h", 2) || !strcasecmp(argv[i], "-?"))
 				usage(argv[0]);
 		}
 	}
 	if(alloc) printf("Testing automatic buffer allocation\n");
-	if(doyuv) {yuv=YUVENCODE;  alloc=0;}
+	if(doyuv) {yuv=YUVENCODE;  alloc=0;  num4bf=4;}
 	doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
-	doTest(39, 41, _4byteFormats, 4, TJSAMP_444, "test");
+	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_444, "test");
 	doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
-	doTest(35, 39, _4byteFormats, 4, TJSAMP_422, "test");
+	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_422, "test");
 	doTest(39, 41, _3byteFormats, 2, TJSAMP_420, "test");
-	doTest(41, 35, _4byteFormats, 4, TJSAMP_420, "test");
+	doTest(41, 35, _4byteFormats, num4bf, TJSAMP_420, "test");
 	doTest(35, 39, _3byteFormats, 2, TJSAMP_440, "test");
-	doTest(39, 41, _4byteFormats, 4, TJSAMP_440, "test");
-	doTest(35, 39, _onlyGray, 1, TJSAMP_GRAY, "test");
-	doTest(39, 41, _3byteFormats, 2, TJSAMP_GRAY, "test");
-	doTest(41, 35, _4byteFormats, 4, TJSAMP_GRAY, "test");
+	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_440, "test");
+	doTest(41, 35, _3byteFormats, 2, TJSAMP_411, "test");
+	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_411, "test");
+	doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test");
+	doTest(41, 35, _3byteFormats, 2, TJSAMP_GRAY, "test");
+	doTest(35, 39, _4byteFormats, 4, TJSAMP_GRAY, "test");
 	if(!doyuv) bufSizeTest();
 	if(doyuv)
 	{
@@ -640,10 +710,12 @@
 		doTest(41, 35, _onlyRGB, 1, TJSAMP_420, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_440, "test_yuv0");
 		doTest(35, 39, _onlyRGB, 1, TJSAMP_440, "test_yuv1");
+		doTest(48, 48, _onlyRGB, 1, TJSAMP_411, "test_yuv0");
+		doTest(39, 41, _onlyRGB, 1, TJSAMP_411, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv0");
-		doTest(35, 39, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv1");
+		doTest(41, 35, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv1");
 		doTest(48, 48, _onlyGray, 1, TJSAMP_GRAY, "test_yuv0");
-		doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test_yuv1");
+		doTest(35, 39, _onlyGray, 1, TJSAMP_GRAY, "test_yuv1");
 	}
 
 	return exitStatus;
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index 634bedf..09c557b 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c
@@ -67,16 +67,23 @@
 	return retval;
 }
 
-JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV
-	(JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
+	(JNIEnv *env, jclass cls, jint width, jint pad, jint height, jint subsamp)
 {
-	jint retval=(jint)tjBufSizeYUV(width, height, subsamp);
+	jint retval=(jint)tjBufSizeYUV2(width, pad, height, subsamp);
 	if(retval==-1) _throw(tjGetErrorStr());
 
 	bailout:
 	return retval;
 }
 
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III
+	(JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
+{
+	return Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII(env, cls, width,
+		4, height, subsamp);
+}
+
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
 	(JNIEnv *env, jobject obj)
 {
@@ -207,12 +214,12 @@
 		flags);
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
-		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+		jint height, jint pf, jbyteArray dst, jint pad, jint subsamp, jint flags)
 {
 	tjhandle handle=0;
-	jsize arraySize=0;
+	jsize arraySize=0, yuvSize;
 	unsigned char *srcBuf=NULL, *dstBuf=NULL;
 
 	gethandle();
@@ -226,15 +233,17 @@
 	arraySize=(pitch==0)? width*tjPixelSize[pf]*height:pitch*height;
 	if((*env)->GetArrayLength(env, src)<arraySize)
 		_throw("Source buffer is not large enough");
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(width, height, subsamp))
+	yuvSize=(jsize)tjBufSizeYUV2(width, pad, height, subsamp);
+	if(yuvSize==(unsigned long)-1)
+		_throw(tjGetErrorStr());
+	if((*env)->GetArrayLength(env, dst)<yuvSize)
 		_throw("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
-		flags)==-1)
+	if(tjEncodeYUV3(handle, srcBuf, width, pitch, height, pf, dstBuf, pad,
+		subsamp, flags)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
@@ -248,12 +257,20 @@
 	return;
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
-	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
 		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
+	Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII(
+		env, obj, src, width, pitch, height, pf, dst, 4, subsamp, flags);
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII
+	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+		jint height, jint pf, jbyteArray dst, jint pad, jint subsamp, jint flags)
+{
 	tjhandle handle=0;
-	jsize arraySize=0;
+	jsize arraySize=0, yuvSize;
 	unsigned char *srcBuf=NULL, *dstBuf=NULL;
 
 	gethandle();
@@ -269,15 +286,17 @@
 	arraySize=(stride==0)? width*height:stride*height;
 	if((*env)->GetArrayLength(env, src)<arraySize)
 		_throw("Source buffer is not large enough");
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(width, height, subsamp))
+	yuvSize=(jsize)tjBufSizeYUV2(width, pad, height, subsamp);
+	if(yuvSize==(unsigned long)-1)
+		_throw(tjGetErrorStr());
+	if((*env)->GetArrayLength(env, dst)<yuvSize)
 		_throw("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjEncodeYUV2(handle, srcBuf, width, stride*sizeof(jint), height, pf,
-		dstBuf, subsamp, flags)==-1)
+	if(tjEncodeYUV3(handle, srcBuf, width, stride*sizeof(jint), height, pf,
+		dstBuf, pad, subsamp, flags)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
@@ -291,6 +310,14 @@
 	return;
 }
 
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
+	(JNIEnv *env, jobject obj, jintArray src, jint width, jint pitch,
+		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+{
+	Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII(
+		env, obj, src, width, pitch, height, pf, dst, 4, subsamp, flags);
+}
+
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
 	(JNIEnv *env, jobject obj)
 {
@@ -355,7 +382,7 @@
 {
 	tjhandle handle=0;
 	unsigned char *jpegBuf=NULL;
-	int width=0, height=0, jpegSubsamp=-1;
+	int width=0, height=0, jpegSubsamp=-1, jpegColorspace=-1;
 
 	gethandle();
 
@@ -364,8 +391,8 @@
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 
-	if(tjDecompressHeader2(handle, jpegBuf, (unsigned long)jpegSize, 
-		&width, &height, &jpegSubsamp)==-1)
+	if(tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize, 
+		&width, &height, &jpegSubsamp, &jpegColorspace)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
 		_throw(tjGetErrorStr());
@@ -374,6 +401,8 @@
 
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
 	(*env)->SetIntField(env, obj, _fid, jpegSubsamp);
+	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegColorspace", "I"));
+	(*env)->SetIntField(env, obj, _fid, jpegColorspace);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
 	(*env)->SetIntField(env, obj, _fid, width);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
@@ -484,13 +513,14 @@
 	
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
-		jint flags)
+		jint desiredWidth, jint pad, jint desiredHeight, jint flags)
 {
 	tjhandle handle=0;
 	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
 	int jpegSubsamp=-1, jpegWidth=0, jpegHeight=0;
+	jsize yuvSize;
 
 	gethandle();
 
@@ -502,15 +532,18 @@
 	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
 	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
-		_throw("Destination buffer is not large enough");
 
+	yuvSize=(jsize)tjBufSizeYUV2(desiredWidth==0? jpegWidth:desiredWidth,
+		pad, desiredHeight==0? jpegHeight:desiredHeight, jpegSubsamp);
+	if(yuvSize==(unsigned long)-1)
+		_throw(tjGetErrorStr());
+	if((*env)->GetArrayLength(env, dst)<yuvSize)
+		_throw("Destination buffer is not large enough");
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
-		flags)==-1)
+	if(tjDecompressToYUV2(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
+		desiredWidth, pad, desiredHeight, flags)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
@@ -524,6 +557,14 @@
 	return;
 }
 
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI
+	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+		jint flags)
+{
+	Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII(
+		env, obj, src, jpegSize, dst, 0, 4, 0, flags);
+}
+
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
 	(JNIEnv *env, jobject obj)
 {
diff --git a/turbojpeg-mapfile b/turbojpeg-mapfile
index bd1ac71..46e6ca7 100755
--- a/turbojpeg-mapfile
+++ b/turbojpeg-mapfile
@@ -36,3 +36,12 @@
 		tjInitTransform;
 		tjTransform;
 } TURBOJPEG_1.1;
+
+TURBOJPEG_1.4
+{
+	global:
+		tjBufSizeYUV2;
+		tjDecompressHeader3;
+		tjDecompressToYUV2;
+		tjEncodeYUV3;
+} TURBOJPEG_1.2;
diff --git a/turbojpeg-mapfile.jni b/turbojpeg-mapfile.jni
index ca39c9e..7a5f98e 100755
--- a/turbojpeg-mapfile.jni
+++ b/turbojpeg-mapfile.jni
@@ -36,7 +36,7 @@
 		tjInitTransform;
 		tjTransform;
 		Java_org_libjpegturbo_turbojpeg_TJ_bufSize;
-		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III;
 		Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors;
 		Java_org_libjpegturbo_turbojpeg_TJCompressor_init;
 		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII;
@@ -48,7 +48,7 @@
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy;		
 		Java_org_libjpegturbo_turbojpeg_TJTransformer_init;
 		Java_org_libjpegturbo_turbojpeg_TJTransformer_transform;
@@ -62,3 +62,16 @@
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII;
 } TURBOJPEG_1.2;
+
+TURBOJPEG_1.4
+{
+	global:
+		tjBufSizeYUV2;
+		tjDecompressHeader3;
+		tjDecompressToYUV2;
+		tjEncodeYUV3;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII;
+} TURBOJPEG_1.3;
diff --git a/turbojpeg.c b/turbojpeg.c
index b10de49..b0acc55 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2012 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2013 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -39,12 +39,14 @@
 #include "./turbojpeg.h"
 #include "./tjutil.h"
 #include "transupp.h"
+#include "./jpegcomp.h"
 
 extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **,
 	unsigned long *, boolean);
 extern void jpeg_mem_src_tj(j_decompress_ptr, unsigned char *, unsigned long);
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
+#define isPow2(x) (((x)&(x-1))==0)
 
 
 /* Error handling (based on example in example.c) */
@@ -85,7 +87,7 @@
 	int init;
 } tjinstance;
 
-static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3};
+static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3, 3};
 
 static const JXFORM_CODE xformtypes[TJ_NUMXOP]=
 {
@@ -185,6 +187,8 @@
 			cinfo->in_color_space=JCS_RGB;  pixelFormat=TJPF_RGB;
 			break;
 		#endif
+		case TJPF_CMYK:
+			cinfo->in_color_space=JCS_CMYK;  break;
 	}
 
 	cinfo->input_components=tjPixelSize[pixelFormat];
@@ -197,15 +201,20 @@
 	}
 	if(subsamp==TJSAMP_GRAY)
 		jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
-	else
-		jpeg_set_colorspace(cinfo, JCS_YCbCr);
+	else if(pixelFormat==TJPF_CMYK)
+		jpeg_set_colorspace(cinfo, JCS_YCCK);
+	else jpeg_set_colorspace(cinfo, JCS_YCbCr);
 
 	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;
 	cinfo->comp_info[1].h_samp_factor=1;
 	cinfo->comp_info[2].h_samp_factor=1;
+	if(cinfo->num_components>3)
+		cinfo->comp_info[3].h_samp_factor=tjMCUWidth[subsamp]/8;
 	cinfo->comp_info[0].v_samp_factor=tjMCUHeight[subsamp]/8;
 	cinfo->comp_info[1].v_samp_factor=1;
 	cinfo->comp_info[2].v_samp_factor=1;
+	if(cinfo->num_components>3)
+		cinfo->comp_info[3].v_samp_factor=tjMCUHeight[subsamp]/8;
 
 	return retval;
 }
@@ -255,6 +264,8 @@
 		case TJPF_ABGR:
 			dinfo->out_color_space=JCS_RGB;  break;
 		#endif
+		case TJPF_CMYK:
+			dinfo->out_color_space=JCS_CMYK;  break;
 		default:
 			_throw("Unsupported pixel format");
 	}
@@ -271,7 +282,10 @@
 	int retval=-1, i, k;
 	for(i=0; i<NUMSUBOPT; i++)
 	{
-		if(dinfo->num_components==pixelsize[i])
+		if(dinfo->num_components==pixelsize[i]
+			|| ((dinfo->jpeg_color_space==JCS_YCCK
+				|| dinfo->jpeg_color_space==JCS_CMYK)
+					&& pixelsize[i]==3 && dinfo->num_components==4))
 		{
 			if(dinfo->comp_info[0].h_samp_factor==tjMCUWidth[i]/8
 				&& dinfo->comp_info[0].v_samp_factor==tjMCUHeight[i]/8)
@@ -279,8 +293,13 @@
 				int match=0;
 				for(k=1; k<dinfo->num_components; k++)
 				{
-					if(dinfo->comp_info[k].h_samp_factor==1
-						&& dinfo->comp_info[k].v_samp_factor==1)
+					int href=1, vref=1;
+					if(dinfo->jpeg_color_space==JCS_YCCK && k==3)
+					{
+						href=tjMCUWidth[i]/8;  vref=tjMCUHeight[i]/8;
+					}
+					if(dinfo->comp_info[k].h_samp_factor==href
+						&& dinfo->comp_info[k].v_samp_factor==vref)
 						match++;
 				}
 				if(match==dinfo->num_components-1)
@@ -531,7 +550,6 @@
 	return retval;
 }
 
-
 DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height)
 {
 	unsigned long retval=0;
@@ -548,22 +566,28 @@
 }
 
 
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
 	int subsamp)
 {
 	unsigned long retval=0;
 	int pw, ph, cw, ch;
-	if(width<1 || height<1 || subsamp<0 || subsamp>=NUMSUBOPT)
-		_throw("tjBufSizeYUV(): Invalid argument");
+	if(width<1 || height<1 || pad<1 || !isPow2(pad) || subsamp<0
+		|| subsamp>=NUMSUBOPT)
+		_throw("tjBufSizeYUV2(): Invalid argument");
 	pw=PAD(width, tjMCUWidth[subsamp]/8);
 	ph=PAD(height, tjMCUHeight[subsamp]/8);
 	cw=pw*8/tjMCUWidth[subsamp];  ch=ph*8/tjMCUHeight[subsamp];
-	retval=PAD(pw, 4)*ph + (subsamp==TJSAMP_GRAY? 0:PAD(cw, 4)*ch*2);
+	retval=PAD(pw, pad)*ph + (subsamp==TJSAMP_GRAY? 0:PAD(cw, pad)*ch*2);
 
 	bailout:
 	return retval;
 }
 
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+	int subsamp)
+{
+	return tjBufSizeYUV2(width, 4, height, subsamp);
+}
 
 DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
 	int subsamp)
@@ -670,9 +694,9 @@
 }
 
 
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle, unsigned char *srcBuf,
 	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
-	int subsamp, int flags)
+	int pad, int subsamp, int flags)
 {
 	int i, retval=0;  JSAMPROW *row_pointer=NULL;
 	JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
@@ -688,7 +712,7 @@
 
 	getinstance(handle);
 	if((this->init&COMPRESS)==0)
-		_throw("tjEncodeYUV2(): Instance has not been initialized for compression");
+		_throw("tjEncodeYUV3(): Instance has not been initialized for compression");
 
 	for(i=0; i<MAX_COMPONENTS; i++)
 	{
@@ -697,9 +721,9 @@
 	}
 
 	if(srcBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
-		|| pixelFormat>=TJ_NUMPF || dstBuf==NULL || subsamp<0
-		|| subsamp>=NUMSUBOPT)
-		_throw("tjEncodeYUV2(): Invalid argument");
+		|| pixelFormat>=TJ_NUMPF || dstBuf==NULL || pad<0 || !isPow2(pad)
+		|| subsamp<0 || subsamp>=NUMSUBOPT)
+		_throw("tjEncodeYUV3(): Invalid argument");
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -708,13 +732,16 @@
 		goto bailout;
 	}
 
+	if(pixelFormat==TJPF_CMYK)
+		_throw("tjEncodeYUV3(): Cannot generate YUV images from CMYK pixels");
+
 	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
 
 	#ifndef JCS_EXTENSIONS
 	if(pixelFormat!=TJPF_GRAY)
 	{
 		rgbBuf=(unsigned char *)malloc(width*height*RGB_PIXELSIZE);
-		if(!rgbBuf) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!rgbBuf) _throw("tjEncodeYUV3(): Memory allocation failure");
 		srcBuf=toRGB(srcBuf, width, pitch, height, pixelFormat, rgbBuf);
 		pitch=width*RGB_PIXELSIZE;
 	}
@@ -727,7 +754,7 @@
 	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
 	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	yuvsize=tjBufSizeYUV(width, height, subsamp);
+	yuvsize=tjBufSizeYUV2(width, pad, height, subsamp);
 	jpeg_mem_dest_tj(cinfo, &dstBuf, &yuvsize, 0);
 	if(setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags)==-1) return -1;
 
@@ -736,7 +763,7 @@
 	ph=PAD(height, cinfo->max_v_samp_factor);
 
 	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph))==NULL)
-		_throw("tjEncodeYUV2(): Memory allocation failure");
+		_throw("tjEncodeYUV3(): Memory allocation failure");
 	for(i=0; i<height; i++)
 	{
 		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&srcBuf[(height-i-1)*pitch];
@@ -751,9 +778,9 @@
 		_tmpbuf[i]=(JSAMPLE *)malloc(
 			PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
 				/compptr->h_samp_factor, 16) * cinfo->max_v_samp_factor + 16);
-		if(!_tmpbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!_tmpbuf[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*cinfo->max_v_samp_factor);
-		if(!tmpbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!tmpbuf[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		for(row=0; row<cinfo->max_v_samp_factor; row++)
 		{
 			unsigned char *_tmpbuf_aligned=
@@ -764,9 +791,9 @@
 		}
 		_tmpbuf2[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
 			* compptr->v_samp_factor + 16);
-		if(!_tmpbuf2[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!_tmpbuf2[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		tmpbuf2[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
-		if(!tmpbuf2[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!tmpbuf2[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		for(row=0; row<compptr->v_samp_factor; row++)
 		{
 			unsigned char *_tmpbuf2_aligned=
@@ -777,15 +804,15 @@
 		cw[i]=pw*compptr->h_samp_factor/cinfo->max_h_samp_factor;
 		ch[i]=ph*compptr->v_samp_factor/cinfo->max_v_samp_factor;
 		outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]);
-		if(!outbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!outbuf[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		for(row=0; row<ch[i]; row++)
 		{
 			outbuf[i][row]=ptr;
-			ptr+=PAD(cw[i], 4);
+			ptr+=PAD(cw[i], pad);
 		}
 	}
 	if(yuvsize!=(unsigned long)(ptr-dstBuf))
-		_throw("tjEncodeYUV2(): Generated image is not the correct size");
+		_throw("tjEncodeYUV3(): Generated image is not the correct size");
 
 	for(row=0; row<ph; row+=cinfo->max_v_samp_factor)
 	{
@@ -817,6 +844,14 @@
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
+	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
+	int subsamp, int flags)
+{
+	return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
+		dstBuf, 4, subsamp, flags);
+}
+
 DLLEXPORT int DLLCALL tjEncodeYUV(tjhandle handle, unsigned char *srcBuf,
 	int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
 	int subsamp, int flags)
@@ -865,19 +900,19 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
 	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-	int *jpegSubsamp)
+	int *jpegSubsamp, int *jpegColorspace)
 {
 	int retval=0;
 
 	getinstance(handle);
 	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressHeader2(): Instance has not been initialized for decompression");
+		_throw("tjDecompressHeader3(): Instance has not been initialized for decompression");
 
 	if(jpegBuf==NULL || jpegSize<=0 || width==NULL || height==NULL
-		|| jpegSubsamp==NULL)
-		_throw("tjDecompressHeader2(): Invalid argument");
+		|| jpegSubsamp==NULL || jpegColorspace==NULL)
+		_throw("tjDecompressHeader3(): Invalid argument");
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -891,18 +926,38 @@
 	*width=dinfo->image_width;
 	*height=dinfo->image_height;
 	*jpegSubsamp=getSubsamp(dinfo);
+	switch(dinfo->jpeg_color_space)
+	{
+		case JCS_GRAYSCALE:  *jpegColorspace=TJCS_GRAY;  break;
+		case JCS_RGB:        *jpegColorspace=TJCS_RGB;  break;
+		case JCS_YCbCr:      *jpegColorspace=TJCS_YCbCr;  break;
+		case JCS_CMYK:       *jpegColorspace=TJCS_CMYK;  break;
+		case JCS_YCCK:       *jpegColorspace=TJCS_YCCK;  break;
+		default:             *jpegColorspace=-1;  break;
+	}
 
 	jpeg_abort_decompress(dinfo);
 
 	if(*jpegSubsamp<0)
-		_throw("tjDecompressHeader2(): Could not determine subsampling type for JPEG image");
+		_throw("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
+	if(*jpegColorspace<0)
+		_throw("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
 	if(*width<1 || *height<1)
-		_throw("tjDecompressHeader2(): Invalid data returned in header");
+		_throw("tjDecompressHeader3(): Invalid data returned in header");
 
 	bailout:
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
+	int *jpegSubsamp)
+{
+	int jpegColorspace;
+	return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
+		jpegSubsamp, &jpegColorspace);
+}
+
 DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
 	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height)
 {
@@ -973,7 +1028,7 @@
 		scaledw=TJSCALED(jpegwidth, sf[i]);
 		scaledh=TJSCALED(jpegheight, sf[i]);
 		if(scaledw<=width && scaledh<=height)
-				break;
+			break;
 	}
 	if(scaledw>width || scaledh>height)
 		_throw("tjDecompress2(): Could not scale down to desired image dimensions");
@@ -1039,26 +1094,29 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
 	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-	int flags)
+	int width, int pad, int height, int flags)
 {
-	int i, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
+	int i, sfi, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
+	int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
 	int cw[MAX_COMPONENTS], ch[MAX_COMPONENTS], iw[MAX_COMPONENTS],
 		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
 	JSAMPLE *_tmpbuf=NULL, *ptr=dstBuf;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+	int dctsize;
 
 	getinstance(handle);
 	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressToYUV(): Instance has not been initialized for decompression");
+		_throw("tjDecompressToYUV2(): Instance has not been initialized for decompression");
 
 	for(i=0; i<MAX_COMPONENTS; i++)
 	{
 		tmpbuf[i]=NULL;  outbuf[i]=NULL;
 	}
 
-	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL)
-		_throw("tjDecompressToYUV(): Invalid argument");
+	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL || width<0 || pad<1
+		|| !isPow2(pad) || height<0)
+		_throw("tjDecompressToYUV2(): Invalid argument");
 
 	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
 	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
@@ -1073,37 +1131,63 @@
 
 	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
 	jpeg_read_header(dinfo, TRUE);
+	jpegSubsamp=getSubsamp(dinfo);
+	if(jpegSubsamp<0)
+		_throw("tjTransform(): Could not determine subsampling type for JPEG image");
+
+	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
+	if(width==0) width=jpegwidth;
+	if(height==0) height=jpegheight;
+	for(i=0; i<NUMSF; i++)
+	{
+		scaledw=TJSCALED(jpegwidth, sf[i]);
+		scaledh=TJSCALED(jpegheight, sf[i]);
+		if(scaledw<=width && scaledh<=height)
+			break;
+	}
+	if(scaledw>width || scaledh>height)
+		_throw("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
+	if(dinfo->num_components>3)
+		_throw("tjDecompressToYUV2(): JPEG image must have 3 or fewer components");
+
+	width=scaledw;  height=scaledh;
+	dinfo->scale_num=sf[i].num;
+	dinfo->scale_denom=sf[i].denom;
+	sfi=i;
+	jpeg_calc_output_dimensions(dinfo);
+
+	dctsize=DCTSIZE*sf[sfi].num/sf[sfi].denom;
 
 	for(i=0; i<dinfo->num_components; i++)
 	{
 		jpeg_component_info *compptr=&dinfo->comp_info[i];
 		int ih;
-		iw[i]=compptr->width_in_blocks*DCTSIZE;
-		ih=compptr->height_in_blocks*DCTSIZE;
-		cw[i]=PAD(dinfo->image_width, dinfo->max_h_samp_factor)
+		iw[i]=compptr->width_in_blocks*dctsize;
+		ih=compptr->height_in_blocks*dctsize;
+		cw[i]=PAD(dinfo->output_width, dinfo->max_h_samp_factor)
 			*compptr->h_samp_factor/dinfo->max_h_samp_factor;
-		ch[i]=PAD(dinfo->image_height, dinfo->max_v_samp_factor)
+		ch[i]=PAD(dinfo->output_height, dinfo->max_v_samp_factor)
 			*compptr->v_samp_factor/dinfo->max_v_samp_factor;
 		if(iw[i]!=cw[i] || ih!=ch[i]) usetmpbuf=1;
-		th[i]=compptr->v_samp_factor*DCTSIZE;
+		th[i]=compptr->v_samp_factor*dctsize;
 		tmpbufsize+=iw[i]*th[i];
 		if((outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]))==NULL)
-			_throw("tjDecompressToYUV(): Memory allocation failure");
+			_throw("tjDecompressToYUV2(): Memory allocation failure");
 		for(row=0; row<ch[i]; row++)
 		{
 			outbuf[i][row]=ptr;
-			ptr+=PAD(cw[i], 4);
+			ptr+=PAD(cw[i], pad);
 		}
 	}
 	if(usetmpbuf)
 	{
 		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
-			_throw("tjDecompressToYUV(): Memory allocation failure");
+			_throw("tjDecompressToYUV2(): Memory allocation failure");
 		ptr=_tmpbuf;
 		for(i=0; i<dinfo->num_components; i++)
 		{
 			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
-				_throw("tjDecompressToYUV(): Memory allocation failure");
+				_throw("tjDecompressToYUV2(): Memory allocation failure");
 			for(row=0; row<th[i]; row++)
 			{
 				tmpbuf[i][row]=ptr;
@@ -1118,18 +1202,37 @@
 
 	jpeg_start_decompress(dinfo);
 	for(row=0; row<(int)dinfo->output_height;
-		row+=dinfo->max_v_samp_factor*DCTSIZE)
+		row+=dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size)
 	{
 		JSAMPARRAY yuvptr[MAX_COMPONENTS];
 		int crow[MAX_COMPONENTS];
 		for(i=0; i<dinfo->num_components; i++)
 		{
 			jpeg_component_info *compptr=&dinfo->comp_info[i];
+			if(jpegSubsamp==TJ_420)
+			{
+				/* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
+				   to be clever and use the IDCT to perform upsampling on the U and V
+				   planes.  For instance, if the output image is to be scaled by 1/2
+				   relative to the JPEG image, then the scaling factor and upsampling
+				   effectively cancel each other, so a normal 8x8 IDCT can be used.
+				   However, this is not desirable when using the decompress-to-YUV
+				   functionality in TurboJPEG, since we want to output the U and V
+				   planes in their subsampled form.  Thus, we have to override some
+				   internal libjpeg parameters to force it to use the "scaled" IDCT
+				   functions on the U and V planes. */
+				compptr->_DCT_scaled_size=dctsize;
+				compptr->MCU_sample_width=tjMCUWidth[jpegSubsamp]*
+					sf[sfi].num/sf[sfi].denom*
+					compptr->v_samp_factor/dinfo->max_v_samp_factor;
+				dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
+			}
 			crow[i]=row*compptr->v_samp_factor/dinfo->max_v_samp_factor;
 			if(usetmpbuf) yuvptr[i]=tmpbuf[i];
 			else yuvptr[i]=&outbuf[i][crow[i]];
 		}
-		jpeg_read_raw_data(dinfo, yuvptr, dinfo->max_v_samp_factor*DCTSIZE);
+		jpeg_read_raw_data(dinfo, yuvptr,
+			dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size);
 		if(usetmpbuf)
 		{
 			int j;
@@ -1155,6 +1258,13 @@
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	int flags)
+{
+	return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
+}
+
 
 /* Transformer */
 
diff --git a/turbojpeg.h b/turbojpeg.h
index 02f3a83..77019e0 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -49,16 +49,20 @@
 /**
  * The number of chrominance subsampling options
  */
-#define TJ_NUMSAMP 5
+#define TJ_NUMSAMP 6
 
 /**
  * Chrominance subsampling options.
- * When an image is converted from the RGB to the YUV colorspace as part of
- * the JPEG compression process, some of the U and V (chrominance) components
- * can be discarded or averaged together to produce a smaller image with little
- * perceptible loss of image clarity (the human eye is more sensitive to small
- * changes in brightness than small changes in color.)  This is called
- * "chrominance subsampling".
+ * When pixels are converted from RGB to YCbCr (see #TJCS_YCbCr) or from CMYK
+ * to YCCK (see #TJCS_YCCK) as part of the JPEG compression process, some of
+ * the Cb and Cr (chrominance) components can be discarded or averaged together
+ * to produce a smaller image with little perceptible loss of image clarity
+ * (the human eye is more sensitive to small changes in brightness than to
+ * small changes in color.)  This is called "chrominance subsampling".
+ * <p>
+ * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+ * convention of the digital video community, the TurboJPEG API uses "YUV" to
+ * refer to an image format consisting of Y, Cb, and Cr image planes.
  */
 enum TJSAMP
 {
@@ -87,7 +91,18 @@
    * chrominance component for every 1x2 block of pixels in the source image.
    * Note that 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
    */
-  TJSAMP_440
+  TJSAMP_440,
+  /**
+   * 4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 4x1 block of pixels in the source image.
+   * JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+   * horizontal features.  Note that 4:1:1 subsampling is not fully accelerated
+   * in libjpeg-turbo.
+   */
+  TJSAMP_411
 };
 
 /**
@@ -96,9 +111,10 @@
  * - 8x8 for no subsampling or grayscale
  * - 16x8 for 4:2:2
  * - 8x16 for 4:4:0
- * - 16x16 for 4:2:0 
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
  */
-static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8};
+static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8, 32};
 
 /**
  * MCU block height (in pixels) for a given level of chrominance subsampling.
@@ -106,15 +122,16 @@
  * - 8x8 for no subsampling or grayscale
  * - 16x8 for 4:2:2
  * - 8x16 for 4:4:0
- * - 16x16 for 4:2:0 
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
  */
-static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16};
+static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16, 8};
 
 
 /**
  * The number of pixel formats
  */
-#define TJ_NUMPF 11
+#define TJ_NUMPF 12
 
 /**
  * Pixel formats
@@ -189,16 +206,33 @@
    * decompressing, the X component is guaranteed to be 0xFF, which can be
    * interpreted as an opaque alpha channel.
    */
-  TJPF_ARGB
+  TJPF_ARGB,
+  /**
+   * CMYK pixel format.  Unlike RGB, which is an additive color model used
+   * primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+   * color model used primarily for printing.  In the CMYK color model, the
+   * value of each color component typically corresponds to an amount of cyan,
+   * magenta, yellow, or black ink that is applied to a white background.  In
+   * order to convert between CMYK and RGB, it is necessary to use a color
+   * management system (CMS.)  A CMS will attempt to map colors within the
+   * printer's gamut to perceptually similar colors in the display's gamut and
+   * vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+   * be defined with a simple formula.  Thus, such a conversion is out of scope
+   * for a codec library.  However, the TurboJPEG API allows for compressing
+   * CMYK pixels into a YCCK JPEG image (see #TJCS_YCCK) and decompressing YCCK
+   * JPEG images into CMYK pixels.
+   */
+  TJPF_CMYK
 };
 
+
 /**
  * Red offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the red component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
  * then the red component will be <tt>pixel[tjRedOffset[TJ_BGRX]]</tt>.
  */
-static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1};
+static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1, -1};
 /**
  * Green offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the green component is offset from the start of the pixel.
@@ -206,19 +240,81 @@
  * <tt>char pixel[]</tt>, then the green component will be
  * <tt>pixel[tjGreenOffset[TJ_BGRX]]</tt>.
  */
-static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2};
+static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, -1};
 /**
  * Blue offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the Blue component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
  * then the blue component will be <tt>pixel[tjBlueOffset[TJ_BGRX]]</tt>.
  */
-static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3};
+static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3, -1};
 
 /**
  * Pixel size (in bytes) for a given pixel format.
  */
-static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4};
+static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4};
+
+
+/**
+ * The number of JPEG colorspaces
+ */
+#define TJ_NUMCS 5
+
+/**
+ * JPEG colorspaces
+ */
+enum TJCS
+{
+  /**
+   * RGB colorspace.  When compressing the JPEG image, the R, G, and B
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  RGB JPEG images can be
+   * decompressed to any of the extended RGB pixel formats or grayscale, but
+   * they cannot be decompressed to YUV images.
+   */
+  TJCS_RGB=0,
+  /**
+   * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+   * mathematical transformation of RGB designed solely for storage and
+   * transmission.  YCbCr images must be converted to RGB before they can
+   * actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+   * component represents the black & white portion of the original image, and
+   * the Cb and Cr (chrominance) components represent the color portion of the
+   * original image.  Originally, the analog equivalent of this transformation
+   * allowed the same signal to drive both black & white and color televisions,
+   * but JPEG images use YCbCr primarily because it allows the color data to be
+   * optionally subsampled for the purposes of reducing bandwidth or disk
+   * space.  YCbCr is the most common JPEG colorspace, and YCbCr JPEG images
+   * can be compressed from and decompressed to any of the extended RGB pixel
+   * formats or grayscale, or they can be decompressed to YUV planar images.
+   */
+  TJCS_YCbCr,
+  /**
+   * Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+   * component), and any color data from the source image is discarded.
+   * Grayscale JPEG images can be compressed from and decompressed to any of
+   * the extended RGB pixel formats or grayscale, or they can be decompressed
+   * to YUV planar images.
+   */
+  TJCS_GRAY,
+  /**
+   * CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  CMYK JPEG images can
+   * only be decompressed to CMYK pixels.
+   */
+  TJCS_CMYK,
+  /**
+   * YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+   * rather a mathematical transformation of CMYK designed solely for storage
+   * and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+   * reversibly transformed into YCCK, and as with YCbCr, the chrominance
+   * components in the YCCK pixels can be subsampled without incurring major
+   * perceptual loss.  YCCK JPEG images can only be compressed from and
+   * decompressed to CMYK pixels.
+   */
+  TJCS_YCCK
+};
 
 
 /**
@@ -441,8 +537,8 @@
   /**
    * A callback function that can be used to modify the DCT coefficients
    * after they are losslessly transformed but before they are transcoded to a
-   * new JPEG file.  This allows for custom filters or other transformations to
-   * be applied in the frequency domain.
+   * new JPEG image.  This allows for custom filters or other transformations
+   * to be applied in the frequency domain.
    *
    * @param coeffs pointer to an array of transformed DCT coefficients.  (NOTE:
    *        this pointer is not guaranteed to be valid once the callback
@@ -508,11 +604,11 @@
 
 
 /**
- * Compress an RGB or grayscale image into a JPEG image.
+ * Compress an RGB, grayscale, or CMYK image into a JPEG image.
  *
  * @param handle a handle to a TurboJPEG compressor or transformer instance
- * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
- *        to be compressed
+ * @param srcBuf pointer to an image buffer containing RGB, grayscale, or
+ *        CMYK pixels to be compressed
  * @param width width (in pixels) of the source image
  * @param pitch bytes per line of the source image.  Normally, this should be
  *        <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded,
@@ -588,6 +684,8 @@
  * the given parameters.
  *
  * @param width width of the image (in pixels)
+ * @param pad the width of each line in each plane of the image is padded to
+ *        the nearest multiple of this number of bytes (must be a power of 2.)
  * @param height height of the image (in pixels)
  * @param subsamp level of chrominance subsampling in the image (see
  *        @ref TJSAMP "Chrominance subsampling options".)
@@ -595,22 +693,26 @@
  * @return the size of the buffer (in bytes) required to hold the image, or
  * -1 if the arguments are out of bounds.
  */
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
   int subsamp);
 
 
 /**
  * Encode an RGB or grayscale image into a YUV planar image.  This function
  * uses the accelerated color conversion routines in TurboJPEG's underlying
- * codec to produce a planar YUV image that is suitable for X Video.
- * Specifically, if the chrominance components are subsampled along the
- * horizontal dimension, then the width of the luminance plane is padded to the
- * nearest multiple of 2 in the output image (same goes for the height of the
- * luminance plane, if the chrominance components are subsampled along the
- * vertical dimension.)  Also, each line of each plane in the output image is
- * padded to 4 bytes.  Although this will work with any subsampling option, it
- * is really only useful in combination with TJ_420, which produces an image
- * compatible with the I420 (AKA "YUV420P") format.
+ * codec but does not execute any of the other steps in the JPEG compression
+ * process.  The Y, U (Cb), and V (Cr) image planes are stored sequentially
+ * into the destination buffer, and the size of each plane is determined by the
+ * width and height of the source image, as well as the specified padding and
+ * level of chrominance subsampling.  If the chrominance components are
+ * subsampled along the horizontal dimension, then the width of the luminance
+ * plane is padded to the nearest multiple of 2 in the output image (same goes
+ * for the height of the luminance plane, if the chrominance components are
+ * subsampled along the vertical dimension.)
+ * <p>
+ * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+ * convention of the digital video community, the TurboJPEG API uses "YUV" to
+ * refer to an image format consisting of Y, Cb, and Cr image planes.
  *
  * @param handle a handle to a TurboJPEG compressor or transformer instance
  * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
@@ -627,20 +729,26 @@
  * @param pixelFormat pixel format of the source image (see @ref TJPF
  *        "Pixel formats".)
  * @param dstBuf pointer to an image buffer that will receive the YUV image.
- *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
- *        based on the image width, height, and level of chrominance
- *        subsampling.
+ *        Use #tjBufSizeYUV2() to determine the appropriate size for this
+ *        buffer based on the image width, height, padding, and level of
+ *        chrominance subsampling.
+ * @param pad the width of each line in each plane of the YUV image will be
+ *        padded to the nearest multiple of this number of bytes (must be a
+ *        power of 2.)  To generate images suitable for X Video, <tt>pad</tt>
+ *        should be set to 4.
  * @param subsamp the level of chrominance subsampling to be used when
  *        generating the YUV image (see @ref TJSAMP
- *        "Chrominance subsampling options".)
+ *        "Chrominance subsampling options".)  To generate images suitable for
+ *        X Video, <tt>subsamp</tt> should be set to @ref TJSAMP_420.  This
+ *        produces an image compatible with the I420 (AKA "YUV420P") format.
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
  *        "flags".
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
+DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
   unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
-  unsigned char *dstBuf, int subsamp, int flags);
+  unsigned char *dstBuf, int pad, int subsamp, int flags);
 
 
 /**
@@ -665,12 +773,15 @@
  * @param jpegSubsamp pointer to an integer variable that will receive the
  *        level of chrominance subsampling used when compressing the JPEG image
  *        (see @ref TJSAMP "Chrominance subsampling options".)
+ * @param jpegColorspace pointer to an integer variable that will receive one
+ *        of the JPEG colorspace constants, indicating the colorspace of the
+ *        JPEG image (see @ref TJCS "JPEG colorspaces".)
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-  int *jpegSubsamp);
+  int *jpegSubsamp, int *jpegColorspace);
 
 
 /**
@@ -687,7 +798,7 @@
 
 
 /**
- * Decompress a JPEG image to an RGB or grayscale image.
+ * Decompress a JPEG image to an RGB, grayscale, or CMYK image.
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
  * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
@@ -714,8 +825,8 @@
  *        calling #TJSCALED() with the JPEG image width and one of the scaling
  *        factors returned by #tjGetScalingFactors().)  You can also be clever
  *        and use the pitch parameter to skip lines, etc.  Setting this
- *        parameter to 0 is the equivalent of setting it to <tt>scaledWidth
- *        * #tjPixelSize[pixelFormat]</tt>.
+ *        parameter to 0 is the equivalent of setting it to
+ *        <tt>scaledWidth * #tjPixelSize[pixelFormat]</tt>.
  * @param height desired height (in pixels) of the destination image.  If this
  *        is different than the height of the JPEG image being decompressed,
  *        then TurboJPEG will use scaling in the JPEG decompressor to generate
@@ -737,26 +848,47 @@
 /**
  * Decompress a JPEG image to a YUV planar image.  This function performs JPEG
  * decompression but leaves out the color conversion step, so a planar YUV
- * image is generated instead of an RGB image.  The padding of the planes in
- * this image is the same as in the images generated by #tjEncodeYUV2().  Note
- * that, if the width or height of the image is not an even multiple of the MCU
- * block size (see #tjMCUWidth and #tjMCUHeight), then an intermediate buffer
- * copy will be performed within TurboJPEG.
+ * image is generated instead of an RGB image.  The structure of the planes in
+ * this image is the same as in the images generated by #tjEncodeYUV3().  Note
+ * that, if the width or height of the JPEG image is not an even multiple of
+ * the MCU block size (see #tjMCUWidth and #tjMCUHeight), then an intermediate
+ * buffer copy will be performed within TurboJPEG.
+ * <p>
+ * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+ * convention of the digital video community, the TurboJPEG API uses "YUV" to
+ * refer to an image format consisting of Y, Cb, and Cr image planes.
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
  * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
  * @param jpegSize size of the JPEG image (in bytes)
  * @param dstBuf pointer to an image buffer that will receive the YUV image.
- *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
- *        based on the image width, height, and level of subsampling.
+ *        Use #tjBufSizeYUV2() to determine the appropriate size for this
+ *        buffer based on the image width, height, padding, and level of
+ *        subsampling.
+ * @param width desired width (in pixels) of the YUV image.  If this is
+ *        different than the width of the JPEG image being decompressed, then
+ *        TurboJPEG will use scaling in the JPEG decompressor to generate the
+ *        largest possible image that will fit within the desired width.  If
+ *        <tt>width</tt> is set to 0, then only the height will be considered
+ *        when determining the scaled image size.
+ * @param pad the width of each line in each plane of the YUV image will be
+ *        padded to the nearest multiple of this number of bytes (must be a
+ *        power of 2.)  To generate images suitable for X Video, <tt>pad</tt>
+ *        should be set to 4.
+ * @param height desired height (in pixels) of the YUV image.  If this is
+ *        different than the height of the JPEG image being decompressed, then
+ *        TurboJPEG will use scaling in the JPEG decompressor to generate the
+ *        largest possible image that will fit within the desired height.  If
+ *        <tt>height</tt> is set to 0, then only the width will be considered
+ *        when determining the scaled image size.
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
  *        "flags".
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-  int flags);
+  int width, int pad, int height, int flags);
 
 
 /**
@@ -893,6 +1025,9 @@
 DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
   int jpegSubsamp);
 
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+  int subsamp);
+
 DLLEXPORT int DLLCALL tjCompress(tjhandle handle, unsigned char *srcBuf,
   int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
   unsigned long *compressedSize, int jpegSubsamp, int jpegQual, int flags);
@@ -901,13 +1036,25 @@
   unsigned char *srcBuf, int width, int pitch, int height, int pixelSize,
   unsigned char *dstBuf, int subsamp, int flags);
 
+DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
+  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
+  unsigned char *dstBuf, int subsamp, int flags);
+
 DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height);
 
+DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
+  int *jpegSubsamp);
+
 DLLEXPORT int DLLCALL tjDecompress(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
   int width, int pitch, int height, int pixelSize, int flags);
 
+DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  int flags);
+
 
 /**
  * @}