Silence warning in CMake 2.8.12 and later


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1151 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d3c0972..d80933e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@
 cmake_policy(SET CMP0022 OLD)
 
 project(libjpeg-turbo C)
-set(VERSION 1.3.1)
+set(VERSION 1.3.80)
 
 if(MINGW OR CYGWIN)
   execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
@@ -316,12 +316,16 @@
 if(WITH_JAVA)
 add_test(TJUnitTest ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest)
 add_test(TJUnitTest-yuv ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -yuv)
+add_test(TJUnitTest-yuv-nopad ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -yuv -noyuvpad)
 add_test(TJUnitTest-bi ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -bi)
 add_test(TJUnitTest-bi-yuv ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -bi -yuv)
+add_test(TJUnitTest-bi-yuv-nopad ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR} TJUnitTest -bi -yuv -noyuvpad)
 endif()
 add_test(tjunittest tjunittest)
 add_test(tjunittest-alloc tjunittest -alloc)
 add_test(tjunittest-yuv tjunittest -yuv)
+add_test(tjunittest-yuv-alloc tjunittest -yuv -alloc)
+add_test(tjunittest-yuv-nopad tjunittest -yuv -noyuvpad)
 add_test(cjpeg-int sharedlib/cjpeg -dct int -outfile testoutint.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
 add_test(cjpeg-int-cmp ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_INT} -DFILE=testoutint.jpg -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
 add_test(cjpeg-fast sharedlib/cjpeg -dct fast -opt -outfile testoutfst.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
@@ -379,6 +383,8 @@
 add_test(tjunittest-static tjunittest-static)
 add_test(tjunittest-static-alloc tjunittest-static -alloc)
 add_test(tjunittest-static-yuv tjunittest-static -yuv)
+add_test(tjunittest-static-yuv-alloc tjunittest-static -yuv -alloc)
+add_test(tjunittest-static-yuv-nopad tjunittest-static -yuv -noyuvpad)
 add_test(cjpeg-static-int cjpeg-static -dct int -outfile testoutint.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
 add_test(cjpeg-static-int-cmp ${CMAKE_COMMAND} -DMD5=${MD5_JPEG_INT} -DFILE=testoutint.jpg -P ${CMAKE_SOURCE_DIR}/cmakescripts/md5cmp.cmake)
 add_test(cjpeg-static-fast cjpeg-static -dct fast -opt -outfile testoutfst.jpg ${CMAKE_SOURCE_DIR}/testimages/testorig.ppm)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index df0041e..5c83016 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,3 +1,38 @@
+1.4 pre-beta
+============
+
+[1] The TurboJPEG API can now be used to generate YUV images with an arbitrary
+line padding (previously, it only supported 4-byte padding, which was
+compatible with X Video.)  Also, the decompress-to-YUV function has been
+extended to support image scaling.
+
+[2] Added SIMD acceleration for DSPr2-capable MIPS platforms.  This speeds up
+the compression of full-color JPEGs by 70-80% on such platforms and
+decompression by 25-35%.
+
+[3] Added support for 4:1:1 subsampling to the TurboJPEG API.  This is mainly
+included for compatibility, since 4:1:1 is not fully accelerated in
+libjpeg-turbo and has no significant advantages relative to 4:2:0.
+
+[4] Added support for CMYK images to the TurboJPEG API.  This feature allows
+CMYK source images to be compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be
+decompressed to CMYK destination images.  Conversion between CMYK and RGB
+images is not supported.  Such conversion requires a color management system
+and is out of scope for a codec library.
+
+[5] The TurboJPEG API can now be used to compress JPEG images from YUV planar
+source images and to decode YUV planar images into RGB, grayscale, or extended
+RGB images.
+
+[6] If an application attempts to decompress a Huffman-coded JPEG image whose
+header does not contain Huffman tables, libjpeg-turbo will now insert the
+default Huffman tables.  In order to save space, many motion JPEG video frames
+are encoded without the default Huffman tables, so these frames can now be
+successfully decompressed by libjpeg-turbo without additional work on the part
+of the application.  An application can still override the Huffman tables, for
+instance to re-use tables from a previous frame of the same video.
+
+
 1.3.1
 =====
 
diff --git a/Makefile.am b/Makefile.am
index 79594bf..2a68a96 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -4,7 +4,7 @@
 
 if WITH_TURBOJPEG
 lib_LTLIBRARIES += libturbojpeg.la
-libturbojpeg_la_LDFLAGS = -version-info 0:0 -no-undefined
+libturbojpeg_la_LDFLAGS = -version-info 1:0:1 -no-undefined
 include_HEADERS += turbojpeg.h
 endif
 
@@ -201,11 +201,15 @@
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -bi
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -noyuvpad
 	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi -noyuvpad
 endif
 	./tjunittest
 	./tjunittest -alloc
 	./tjunittest -yuv
+	./tjunittest -yuv -alloc
+	./tjunittest -yuv -noyuvpad
 endif
 	./cjpeg -dct int -outfile testoutint.jpg $(srcdir)/testimages/testorig.ppm
 	md5/md5cmp $(MD5_JPEG_INT) testoutint.jpg
@@ -314,6 +318,11 @@
 	rm -f *_440_*.ppm
 	rm -f *_440_*.jpg
 	rm -f *_440.yuv
+	rm -f *_411_*.bmp
+	rm -f *_411_*.png
+	rm -f *_411_*.ppm
+	rm -f *_411_*.jpg
+	rm -f *_411.yuv
 
 
 tjtest:
diff --git a/acinclude.m4 b/acinclude.m4
index afb4359..40a9e52 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -180,3 +180,66 @@
     $2
   fi
 ])
+
+# AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE
+# --------------------------
+# Test whether the assembler is suitable and supports MIPS instructions
+AC_DEFUN([AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE],[
+  have_mips_dspr2=no
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="$CCASFLAGS -mdspr2"
+
+  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+
+  int main ()
+  {
+    int c = 0, a = 0, b = 0;
+    __asm__ __volatile__ (
+        "precr.qb.ph %[c], %[a], %[b]          \n\t"
+        : [c] "=r" (c)
+        : [a] "r" (a), [b] "r" (b)
+    );
+    return c;
+  }
+  ]])], have_mips_dspr2=yes)
+  CFLAGS=$ac_save_CFLAGS
+
+  if test "x$have_mips_dspr2" = "xyes" ; then
+    $1
+  else
+    $2
+  fi
+])
+
+AC_DEFUN([AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE],[
+  ac_good_gnu_arm_assembler=no
+  ac_save_CC="$CC"
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="$CCASFLAGS -x assembler-with-cpp"
+  CC="$CCAS"
+  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+    .text
+    movi v0.16b, #100]])], ac_good_gnu_arm_assembler=yes)
+
+  ac_use_gas_preprocessor=no
+  if test "x$ac_good_gnu_arm_assembler" = "xno" ; then
+    CC="gas-preprocessor.pl $CCAS"
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+      .text
+      movi v0.16b, #100]])], ac_use_gas_preprocessor=yes)
+  fi
+  CFLAGS="$ac_save_CFLAGS"
+  CC="$ac_save_CC"
+
+  if test "x$ac_use_gas_preprocessor" = "xyes" ; then
+    CCAS="gas-preprocessor.pl $CCAS"
+    AC_SUBST([CCAS])
+    ac_good_gnu_arm_assembler=yes
+  fi
+
+  if test "x$ac_good_gnu_arm_assembler" = "xyes" ; then
+    $1
+  else
+    $2
+  fi
+])
diff --git a/configure.ac b/configure.ac
index 34eea29..d0780c9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([libjpeg-turbo], [1.3.1])
+AC_INIT([libjpeg-turbo], [1.3.80])
 BUILD=`date +%Y%m%d`
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
@@ -435,6 +435,38 @@
         fi
       fi
       ;;
+    aarch64*)
+      AC_MSG_RESULT([yes (arm64)])
+      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
+      AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE(
+        [AC_MSG_RESULT([yes])
+         simd_arch=aarch64],
+        [AC_MSG_RESULT([no])
+         with_simd=no])
+      if test "x${with_simd}" = "xno"; then
+        if test "x${require_simd}" = "xyes"; then
+          AC_MSG_ERROR([SIMD support can't be enabled.])
+        else
+          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
+        fi
+      fi
+      ;;
+    mipsel*)
+      AC_MSG_RESULT([yes (mipsel)])
+      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
+      AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE(
+        [AC_MSG_RESULT([yes])
+         simd_arch=mipsel],
+        [AC_MSG_RESULT([no])
+         with_simd=no])
+      if test "x${with_simd}" = "xno"; then
+        if test "x${require_simd}" = "xyes"; then
+          AC_MSG_ERROR([SIMD support can't be enabled.])
+        else
+          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
+        fi
+      fi
+      ;;
     *)
       AC_MSG_RESULT([no ("$host_cpu")])
       with_simd=no;
@@ -458,6 +490,8 @@
 AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
 AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
 AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
+AM_CONDITIONAL([SIMD_ARM_64], [test "x$simd_arch" = "xaarch64"])
+AM_CONDITIONAL([SIMD_MIPSEL], [test "x$simd_arch" = "xmipsel"])
 AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
 AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
 
diff --git a/doc/html/annotated.html b/doc/html/annotated.html
index 1e3fbd0..f928720 100644
--- a/doc/html/annotated.html
+++ b/doc/html/annotated.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/classes.html b/doc/html/classes.html
index 4722b14..ad625f1 100644
--- a/doc/html/classes.html
+++ b/doc/html/classes.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions.html b/doc/html/functions.html
index 7af0d8e..55ccba0 100644
--- a/doc/html/functions.html
+++ b/doc/html/functions.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions_vars.html b/doc/html/functions_vars.html
index e6a6f72..cdc5560 100644
--- a/doc/html/functions_vars.html
+++ b/doc/html/functions_vars.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/group___turbo_j_p_e_g.html b/doc/html/group___turbo_j_p_e_g.html
index b0ab027..28e4926 100644
--- a/doc/html/group___turbo_j_p_e_g.html
+++ b/doc/html/group___turbo_j_p_e_g.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
@@ -109,21 +109,12 @@
 <tr class="memitem:ga7010a4402f54a45ba822ad8675a4655e"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e">TJ_NUMPF</a></td></tr>
 <tr class="memdesc:ga7010a4402f54a45ba822ad8675a4655e"><td class="mdescLeft">&#160;</td><td class="mdescRight">The number of pixel formats.  <a href="#ga7010a4402f54a45ba822ad8675a4655e">More...</a><br/></td></tr>
 <tr class="separator:ga7010a4402f54a45ba822ad8675a4655e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga39f57a6fb02d9cf32e7b6890099b5a71"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga39f57a6fb02d9cf32e7b6890099b5a71">TJ_NUMCS</a></td></tr>
+<tr class="memdesc:ga39f57a6fb02d9cf32e7b6890099b5a71"><td class="mdescLeft">&#160;</td><td class="mdescRight">The number of JPEG colorspaces.  <a href="#ga39f57a6fb02d9cf32e7b6890099b5a71">More...</a><br/></td></tr>
+<tr class="separator:ga39f57a6fb02d9cf32e7b6890099b5a71"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga72ecf4ebe6eb702d3c6f5ca27455e1ec"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">TJFLAG_BOTTOMUP</a></td></tr>
 <tr class="memdesc:ga72ecf4ebe6eb702d3c6f5ca27455e1ec"><td class="mdescLeft">&#160;</td><td class="mdescRight">The uncompressed source/destination image is stored in bottom-up (Windows, OpenGL) order, not top-down (X11) order.  <a href="#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">More...</a><br/></td></tr>
 <tr class="separator:ga72ecf4ebe6eb702d3c6f5ca27455e1ec"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga4e872f11c82f241736fa8297920f24e5"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga4e872f11c82f241736fa8297920f24e5">TJFLAG_FORCEMMX</a></td></tr>
-<tr class="memdesc:ga4e872f11c82f241736fa8297920f24e5"><td class="mdescLeft">&#160;</td><td class="mdescRight">Turn off CPU auto-detection and force TurboJPEG to use MMX code (if the underlying codec supports it.)  <a href="#ga4e872f11c82f241736fa8297920f24e5">More...</a><br/></td></tr>
-<tr class="separator:ga4e872f11c82f241736fa8297920f24e5"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gae17e63189e8cd730feed3efbd2454f38"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gae17e63189e8cd730feed3efbd2454f38">TJFLAG_FORCESSE</a></td></tr>
-<tr class="memdesc:gae17e63189e8cd730feed3efbd2454f38"><td class="mdescLeft">&#160;</td><td class="mdescRight">Turn off CPU auto-detection and force TurboJPEG to use SSE code (if the underlying codec supports it.)  <a href="#gae17e63189e8cd730feed3efbd2454f38">More...</a><br/></td></tr>
-<tr class="separator:gae17e63189e8cd730feed3efbd2454f38"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga8cf0bca96ea4d472563f4b0ebf8c48e7"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga8cf0bca96ea4d472563f4b0ebf8c48e7">TJFLAG_FORCESSE2</a></td></tr>
-<tr class="memdesc:ga8cf0bca96ea4d472563f4b0ebf8c48e7"><td class="mdescLeft">&#160;</td><td class="mdescRight">Turn off CPU auto-detection and force TurboJPEG to use SSE2 code (if the underlying codec supports it.)  <a href="#ga8cf0bca96ea4d472563f4b0ebf8c48e7">More...</a><br/></td></tr>
-<tr class="separator:ga8cf0bca96ea4d472563f4b0ebf8c48e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gaf9d49066633404da4386d70820295dd2"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaf9d49066633404da4386d70820295dd2">TJFLAG_FORCESSE3</a></td></tr>
-<tr class="memdesc:gaf9d49066633404da4386d70820295dd2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Turn off CPU auto-detection and force TurboJPEG to use SSE3 code (if the underlying codec supports it.)  <a href="#gaf9d49066633404da4386d70820295dd2">More...</a><br/></td></tr>
-<tr class="separator:gaf9d49066633404da4386d70820295dd2"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga4ee4506c81177a06f77e2504a22efd2d"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga4ee4506c81177a06f77e2504a22efd2d">TJFLAG_FASTUPSAMPLE</a></td></tr>
 <tr class="memdesc:ga4ee4506c81177a06f77e2504a22efd2d"><td class="mdescLeft">&#160;</td><td class="mdescRight">When decompressing an image that was compressed using chrominance subsampling, use the fastest chrominance upsampling algorithm available in the underlying codec.  <a href="#ga4ee4506c81177a06f77e2504a22efd2d">More...</a><br/></td></tr>
 <tr class="separator:ga4ee4506c81177a06f77e2504a22efd2d"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -178,7 +169,8 @@
 <a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737">TJSAMP_420</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a3f1c9504842ddc7a48d0f690754b6248">TJSAMP_GRAY</a>, 
 <br/>
-&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974">TJSAMP_440</a>
+&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974">TJSAMP_440</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2">TJSAMP_411</a>
 <br/>
  }</td></tr>
 <tr class="memdesc:ga1d047060ea80bb9820d540bb928e9074"><td class="mdescLeft">&#160;</td><td class="mdescRight">Chrominance subsampling options.  <a href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">More...</a><br/></td></tr>
@@ -196,11 +188,23 @@
 <br/>
 &#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aac037ff1845cf9b74bb81a3659c2b9fb4">TJPF_BGRA</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa1ba1a7f1631dbeaa49a0a85fc4a40081">TJPF_ABGR</a>, 
-<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c">TJPF_ARGB</a>
+<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c">TJPF_ARGB</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b">TJPF_CMYK</a>
 <br/>
  }</td></tr>
 <tr class="memdesc:gac916144e26c3817ac514e64ae5d12e2a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Pixel formats.  <a href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">More...</a><br/></td></tr>
 <tr class="separator:gac916144e26c3817ac514e64ae5d12e2a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">TJCS</a> { <br/>
+&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555">TJCS_RGB</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75">TJCS_YCbCr</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a">TJCS_GRAY</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53">TJCS_CMYK</a>, 
+<br/>
+&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e">TJCS_YCCK</a>
+<br/>
+ }</td></tr>
+<tr class="memdesc:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="mdescLeft">&#160;</td><td class="mdescRight">JPEG colorspaces.  <a href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">More...</a><br/></td></tr>
+<tr class="separator:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga2de531af4e7e6c4f124908376b354866"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866">TJXOP</a> { <br/>
 &#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aad88c0366cd3f7d0eac9d7a3fa1c2c27">TJXOP_NONE</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aa0df69776caa30f0fa28e26332d311ce">TJXOP_HFLIP</a>, 
@@ -222,32 +226,38 @@
 <tr class="memdesc:ga3d10c47fbe4a2489a2b30c931551d01a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a TurboJPEG compressor instance.  <a href="#ga3d10c47fbe4a2489a2b30c931551d01a">More...</a><br/></td></tr>
 <tr class="separator:ga3d10c47fbe4a2489a2b30c931551d01a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaba62b7a98f960839b588579898495cf2"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2">tjCompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)</td></tr>
-<tr class="memdesc:gaba62b7a98f960839b588579898495cf2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress an RGB or grayscale image into a JPEG image.  <a href="#gaba62b7a98f960839b588579898495cf2">More...</a><br/></td></tr>
+<tr class="memdesc:gaba62b7a98f960839b588579898495cf2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress an RGB, grayscale, or CMYK image into a JPEG image.  <a href="#gaba62b7a98f960839b588579898495cf2">More...</a><br/></td></tr>
 <tr class="separator:gaba62b7a98f960839b588579898495cf2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga0b931126c7a615ddc3bbd0cca6698d67"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0b931126c7a615ddc3bbd0cca6698d67">tjCompressFromYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pad, int height, int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)</td></tr>
+<tr class="memdesc:ga0b931126c7a615ddc3bbd0cca6698d67"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compress a YUV planar image into a JPEG image.  <a href="#ga0b931126c7a615ddc3bbd0cca6698d67">More...</a><br/></td></tr>
+<tr class="separator:ga0b931126c7a615ddc3bbd0cca6698d67"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned long DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b">tjBufSize</a> (int width, int height, int jpegSubsamp)</td></tr>
 <tr class="memdesc:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="mdescLeft">&#160;</td><td class="mdescRight">The maximum size of the buffer (in bytes) required to hold a JPEG image with the given parameters.  <a href="#gaccc5bca7f12fcdcc302e6e1c6d4b311b">More...</a><br/></td></tr>
 <tr class="separator:gaccc5bca7f12fcdcc302e6e1c6d4b311b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga9d0cb06fd5052d21b6f2b382db8b219c"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned long DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c">tjBufSizeYUV</a> (int width, int height, int subsamp)</td></tr>
-<tr class="memdesc:ga9d0cb06fd5052d21b6f2b382db8b219c"><td class="mdescLeft">&#160;</td><td class="mdescRight">The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters.  <a href="#ga9d0cb06fd5052d21b6f2b382db8b219c">More...</a><br/></td></tr>
-<tr class="separator:ga9d0cb06fd5052d21b6f2b382db8b219c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga0fa4e7b1943687c6a0c0304529c55d35"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35">tjEncodeYUV2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf, int subsamp, int flags)</td></tr>
-<tr class="memdesc:ga0fa4e7b1943687c6a0c0304529c55d35"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into a YUV planar image.  <a href="#ga0fa4e7b1943687c6a0c0304529c55d35">More...</a><br/></td></tr>
-<tr class="separator:ga0fa4e7b1943687c6a0c0304529c55d35"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gaf451664a62c1f6c7cc5a6401f32908c9"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned long DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9">tjBufSizeYUV2</a> (int width, int pad, int height, int subsamp)</td></tr>
+<tr class="memdesc:gaf451664a62c1f6c7cc5a6401f32908c9"><td class="mdescLeft">&#160;</td><td class="mdescRight">The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters.  <a href="#gaf451664a62c1f6c7cc5a6401f32908c9">More...</a><br/></td></tr>
+<tr class="separator:gaf451664a62c1f6c7cc5a6401f32908c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360">tjEncodeYUV3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags)</td></tr>
+<tr class="memdesc:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="mdescLeft">&#160;</td><td class="mdescRight">Encode an RGB or grayscale image into a YUV planar image.  <a href="#ga0a5ffbf7cb58a5b6a8201114fe889360">More...</a><br/></td></tr>
+<tr class="separator:ga0a5ffbf7cb58a5b6a8201114fe889360"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gae5408179d041e2a2f7199c8283cf649e"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gae5408179d041e2a2f7199c8283cf649e">tjInitDecompress</a> (void)</td></tr>
 <tr class="memdesc:gae5408179d041e2a2f7199c8283cf649e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a TurboJPEG decompressor instance.  <a href="#gae5408179d041e2a2f7199c8283cf649e">More...</a><br/></td></tr>
 <tr class="separator:gae5408179d041e2a2f7199c8283cf649e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gac5675fceb7997b385516cdffdb34e6aa"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gac5675fceb7997b385516cdffdb34e6aa">tjDecompressHeader2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height, int *jpegSubsamp)</td></tr>
-<tr class="memdesc:gac5675fceb7997b385516cdffdb34e6aa"><td class="mdescLeft">&#160;</td><td class="mdescRight">Retrieve information about a JPEG image without decompressing it.  <a href="#gac5675fceb7997b385516cdffdb34e6aa">More...</a><br/></td></tr>
-<tr class="separator:gac5675fceb7997b385516cdffdb34e6aa"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gacd0fac3af74b3511d39b4781b7103086"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086">tjDecompressHeader3</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height, int *jpegSubsamp, int *jpegColorspace)</td></tr>
+<tr class="memdesc:gacd0fac3af74b3511d39b4781b7103086"><td class="mdescLeft">&#160;</td><td class="mdescRight">Retrieve information about a JPEG image without decompressing it.  <a href="#gacd0fac3af74b3511d39b4781b7103086">More...</a><br/></td></tr>
+<tr class="separator:gacd0fac3af74b3511d39b4781b7103086"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga6449044b9af402999ccf52f401333be8"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="structtjscalingfactor.html">tjscalingfactor</a> *DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8">tjGetScalingFactors</a> (int *numscalingfactors)</td></tr>
 <tr class="memdesc:ga6449044b9af402999ccf52f401333be8"><td class="mdescLeft">&#160;</td><td class="mdescRight">Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of TurboJPEG supports.  <a href="#ga6449044b9af402999ccf52f401333be8">More...</a><br/></td></tr>
 <tr class="separator:ga6449044b9af402999ccf52f401333be8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gada69cc6443d1bb493b40f1626259e5e9"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9">tjDecompress2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
-<tr class="memdesc:gada69cc6443d1bb493b40f1626259e5e9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to an RGB or grayscale image.  <a href="#gada69cc6443d1bb493b40f1626259e5e9">More...</a><br/></td></tr>
+<tr class="memdesc:gada69cc6443d1bb493b40f1626259e5e9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to an RGB, grayscale, or CMYK image.  <a href="#gada69cc6443d1bb493b40f1626259e5e9">More...</a><br/></td></tr>
 <tr class="separator:gada69cc6443d1bb493b40f1626259e5e9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:gad7810af095624a4016e72957a50f77d8"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gad7810af095624a4016e72957a50f77d8">tjDecompressToYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int flags)</td></tr>
-<tr class="memdesc:gad7810af095624a4016e72957a50f77d8"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to a YUV planar image.  <a href="#gad7810af095624a4016e72957a50f77d8">More...</a><br/></td></tr>
-<tr class="separator:gad7810af095624a4016e72957a50f77d8"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07">tjDecompressToYUV2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf, int width, int pad, int height, int flags)</td></tr>
+<tr class="memdesc:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decompress a JPEG image to a YUV planar image.  <a href="#ga7c08b340ad7f8e85d407bd9e81d44d07">More...</a><br/></td></tr>
+<tr class="separator:ga7c08b340ad7f8e85d407bd9e81d44d07"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga132ae2c2cadcf64c8bb0f3bdf69da3ed"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga132ae2c2cadcf64c8bb0f3bdf69da3ed">tjDecodeYUV</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle, unsigned char *srcBuf, int pad, int subsamp, unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
+<tr class="memdesc:ga132ae2c2cadcf64c8bb0f3bdf69da3ed"><td class="mdescLeft">&#160;</td><td class="mdescRight">Decode a YUV planar image into an RGB or grayscale image.  <a href="#ga132ae2c2cadcf64c8bb0f3bdf69da3ed">More...</a><br/></td></tr>
+<tr class="separator:ga132ae2c2cadcf64c8bb0f3bdf69da3ed"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga3155b775bfbac9dbba869b95a0367902"><td class="memItemLeft" align="right" valign="top">DLLEXPORT <a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga3155b775bfbac9dbba869b95a0367902">tjInitTransform</a> (void)</td></tr>
 <tr class="memdesc:ga3155b775bfbac9dbba869b95a0367902"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a new TurboJPEG transformer instance.  <a href="#ga3155b775bfbac9dbba869b95a0367902">More...</a><br/></td></tr>
 <tr class="separator:ga3155b775bfbac9dbba869b95a0367902"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -292,6 +302,20 @@
 <p>TurboJPEG API. </p>
 <p>This API provides an interface for generating, decoding, and transforming planar YUV and JPEG images in memory. </p>
 <h2 class="groupheader">Macro Definition Documentation</h2>
+<a class="anchor" id="ga39f57a6fb02d9cf32e7b6890099b5a71"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define TJ_NUMCS</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>The number of JPEG colorspaces. </p>
+
+</div>
+</div>
 <a class="anchor" id="ga7010a4402f54a45ba822ad8675a4655e"></a>
 <div class="memitem">
 <div class="memproto">
@@ -393,62 +417,6 @@
 
 </div>
 </div>
-<a class="anchor" id="ga4e872f11c82f241736fa8297920f24e5"></a>
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">#define TJFLAG_FORCEMMX</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-<p>Turn off CPU auto-detection and force TurboJPEG to use MMX code (if the underlying codec supports it.) </p>
-
-</div>
-</div>
-<a class="anchor" id="gae17e63189e8cd730feed3efbd2454f38"></a>
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">#define TJFLAG_FORCESSE</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-<p>Turn off CPU auto-detection and force TurboJPEG to use SSE code (if the underlying codec supports it.) </p>
-
-</div>
-</div>
-<a class="anchor" id="ga8cf0bca96ea4d472563f4b0ebf8c48e7"></a>
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">#define TJFLAG_FORCESSE2</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-<p>Turn off CPU auto-detection and force TurboJPEG to use SSE2 code (if the underlying codec supports it.) </p>
-
-</div>
-</div>
-<a class="anchor" id="gaf9d49066633404da4386d70820295dd2"></a>
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">#define TJFLAG_FORCESSE3</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-<p>Turn off CPU auto-detection and force TurboJPEG to use SSE3 code (if the underlying codec supports it.) </p>
-
-</div>
-</div>
 <a class="anchor" id="ga8808d403c68b62aaa58a4c1e58e98963"></a>
 <div class="memitem">
 <div class="memproto">
@@ -460,7 +428,7 @@
 </div><div class="memdoc">
 
 <p>Disable buffer (re)allocation. </p>
-<p>If passed to <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB or grayscale image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>, this flag will cause those functions to generate an error if the JPEG image buffer is invalid or too small rather than attempting to allocate or reallocate that buffer. This reproduces the behavior of earlier versions of TurboJPEG. </p>
+<p>If passed to <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a>, this flag will cause those functions to generate an error if the JPEG image buffer is invalid or too small rather than attempting to allocate or reallocate that buffer. This reproduces the behavior of earlier versions of TurboJPEG. </p>
 
 </div>
 </div>
@@ -613,6 +581,42 @@
 </div>
 </div>
 <h2 class="groupheader">Enumeration Type Documentation</h2>
+<a class="anchor" id="ga4f83ad3368e0e29d1957be0efa7c3720"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">enum <a class="el" href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">TJCS</a></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>JPEG colorspaces. </p>
+<table class="fieldtable">
+<tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555"></a>TJCS_RGB</em>&nbsp;</td><td class="fielddoc">
+<p>RGB colorspace. </p>
+<p>When compressing the JPEG image, the R, G, and B components in the source image are reordered into image planes, but no colorspace conversion or subsampling is performed. RGB JPEG images can be decompressed to any of the extended RGB pixel formats or grayscale, but they cannot be decompressed to YUV images. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75"></a>TJCS_YCbCr</em>&nbsp;</td><td class="fielddoc">
+<p>YCbCr colorspace. </p>
+<p>YCbCr is not an absolute colorspace but rather a mathematical transformation of RGB designed solely for storage and transmission. YCbCr images must be converted to RGB before they can actually be displayed. In the YCbCr colorspace, the Y (luminance) component represents the black &amp; white portion of the original image, and the Cb and Cr (chrominance) components represent the color portion of the original image. Originally, the analog equivalent of this transformation allowed the same signal to drive both black &amp; white and color televisions, but JPEG images use YCbCr primarily because it allows the color data to be optionally subsampled for the purposes of reducing bandwidth or disk space. YCbCr is the most common JPEG colorspace, and YCbCr JPEG images can be compressed from and decompressed to any of the extended RGB pixel formats or grayscale, or they can be decompressed to YUV planar images. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a"></a>TJCS_GRAY</em>&nbsp;</td><td class="fielddoc">
+<p>Grayscale colorspace. </p>
+<p>The JPEG image retains only the luminance data (Y component), and any color data from the source image is discarded. Grayscale JPEG images can be compressed from and decompressed to any of the extended RGB pixel formats or grayscale, or they can be decompressed to YUV planar images. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53"></a>TJCS_CMYK</em>&nbsp;</td><td class="fielddoc">
+<p>CMYK colorspace. </p>
+<p>When compressing the JPEG image, the C, M, Y, and K components in the source image are reordered into image planes, but no colorspace conversion or subsampling is performed. CMYK JPEG images can only be decompressed to CMYK pixels. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e"></a>TJCS_YCCK</em>&nbsp;</td><td class="fielddoc">
+<p>YCCK colorspace. </p>
+<p>YCCK (AKA "YCbCrK") is not an absolute colorspace but rather a mathematical transformation of CMYK designed solely for storage and transmission. It is to CMYK as YCbCr is to RGB. CMYK pixels can be reversibly transformed into YCCK, and as with YCbCr, the chrominance components in the YCCK pixels can be subsampled without incurring major perceptual loss. YCCK JPEG images can only be compressed from and decompressed to CMYK pixels. </p>
+</td></tr>
+</table>
+
+</div>
+</div>
 <a class="anchor" id="gac916144e26c3817ac514e64ae5d12e2a"></a>
 <div class="memitem">
 <div class="memproto">
@@ -669,6 +673,10 @@
 <p>ARGB pixel format. </p>
 <p>This is the same as <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aadae996905efcfa3b42a0bb3bea7f9d84">TJPF_XRGB</a>, except that when decompressing, the X component is guaranteed to be 0xFF, which can be interpreted as an opaque alpha channel. </p>
 </td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b"></a>TJPF_CMYK</em>&nbsp;</td><td class="fielddoc">
+<p>CMYK pixel format. </p>
+<p>Unlike RGB, which is an additive color model used primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive color model used primarily for printing. In the CMYK color model, the value of each color component typically corresponds to an amount of cyan, magenta, yellow, or black ink that is applied to a white background. In order to convert between CMYK and RGB, it is necessary to use a color management system (CMS.) A CMS will attempt to map colors within the printer's gamut to perceptually similar colors in the display's gamut and vice versa, but the mapping is typically not 1:1 or reversible, nor can it be defined with a simple formula. Thus, such a conversion is out of scope for a codec library. However, the TurboJPEG API allows for compressing CMYK pixels into a YCCK JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e" title="YCCK colorspace.">TJCS_YCCK</a>) and decompressing YCCK JPEG images into CMYK pixels. </p>
+</td></tr>
 </table>
 
 </div>
@@ -684,7 +692,7 @@
 </div><div class="memdoc">
 
 <p>Chrominance subsampling options. </p>
-<p>When an image is converted from the RGB to the YCbCr colorspace as part of the JPEG compression process, some of the Cb and Cr (chrominance) components can be discarded or averaged together to produce a smaller image with little perceptible loss of image clarity (the human eye is more sensitive to small changes in brightness than small changes in color.) This is called "chrominance subsampling". </p>
+<p>When pixels are converted from RGB to YCbCr (see <a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75" title="YCbCr colorspace.">TJCS_YCbCr</a>) or from CMYK to YCCK (see <a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e" title="YCCK colorspace.">TJCS_YCCK</a>) as part of the JPEG compression process, some of the Cb and Cr (chrominance) components can be discarded or averaged together to produce a smaller image with little perceptible loss of image clarity (the human eye is more sensitive to small changes in brightness than to small changes in color.) This is called "chrominance subsampling". </p>
 <p>NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the convention of the digital video community, the TurboJPEG API uses "YUV" to refer to an image format consisting of Y, Cb, and Cr image planes. </p>
 <table class="fieldtable">
 <tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><em><a class="anchor" id="gga1d047060ea80bb9820d540bb928e9074afb8da4f44197837bdec0a4f593dacae3"></a>TJSAMP_444</em>&nbsp;</td><td class="fielddoc">
@@ -707,6 +715,10 @@
 <p>4:4:0 chrominance subsampling. </p>
 <p>The JPEG or YUV image will contain one chrominance component for every 1x2 block of pixels in the source image. Note that 4:4:0 subsampling is not fully accelerated in libjpeg-turbo. </p>
 </td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2"></a>TJSAMP_411</em>&nbsp;</td><td class="fielddoc">
+<p>4:1:1 chrominance subsampling. </p>
+<p>The JPEG or YUV image will contain one chrominance component for every 4x1 block of pixels in the source image. JPEG images compressed with 4:1:1 subsampling will be almost exactly the same size as those compressed with 4:2:0 subsampling, and in the aggregate, both subsampling methods produce approximately the same perceptual quality. However, 4:1:1 is better able to reproduce sharp horizontal features. Note that 4:1:1 subsampling is not fully accelerated in libjpeg-turbo. </p>
+</td></tr>
 </table>
 
 </div>
@@ -772,7 +784,7 @@
 </div><div class="memdoc">
 
 <p>Allocate an image buffer for use with TurboJPEG. </p>
-<p>You should always use this function to allocate the JPEG destination buffer(s) for <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB or grayscale image into a JPEG image.">tjCompress2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> unless you are disabling automatic buffer (re)allocation (by setting <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a>.)</p>
+<p>You should always use this function to allocate the JPEG destination buffer(s) for <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> unless you are disabling automatic buffer (re)allocation (by setting <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a>.)</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">bytes</td><td>the number of bytes to allocate</td></tr>
@@ -828,12 +840,12 @@
 
 </div>
 </div>
-<a class="anchor" id="ga9d0cb06fd5052d21b6f2b382db8b219c"></a>
+<a class="anchor" id="gaf451664a62c1f6c7cc5a6401f32908c9"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT unsigned long DLLCALL tjBufSizeYUV </td>
+          <td class="memname">DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2 </td>
           <td>(</td>
           <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>width</em>, </td>
@@ -842,6 +854,12 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>height</em>, </td>
         </tr>
         <tr>
@@ -862,6 +880,7 @@
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">width</td><td>width of the image (in pixels) </td></tr>
+    <tr><td class="paramname">pad</td><td>the width of each line in each plane of the image is padded to the nearest multiple of this number of bytes (must be a power of 2.) </td></tr>
     <tr><td class="paramname">height</td><td>height of the image (in pixels) </td></tr>
     <tr><td class="paramname">subsamp</td><td>level of chrominance subsampling in the image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.)</td></tr>
   </table>
@@ -949,11 +968,11 @@
       </table>
 </div><div class="memdoc">
 
-<p>Compress an RGB or grayscale image into a JPEG image. </p>
+<p>Compress an RGB, grayscale, or CMYK image into a JPEG image. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance </td></tr>
-    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB or grayscale pixels to be compressed </td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing RGB, grayscale, or CMYK pixels to be compressed </td></tr>
     <tr><td class="paramname">width</td><td>width (in pixels) of the source image </td></tr>
     <tr><td class="paramname">pitch</td><td>bytes per line of the source image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use this parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>. </td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image </td></tr>
@@ -975,6 +994,197 @@
 
 </div>
 </div>
+<a class="anchor" id="ga0b931126c7a615ddc3bbd0cca6698d67"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">DLLEXPORT int DLLCALL tjCompressFromYUV </td>
+          <td>(</td>
+          <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
+          <td class="paramname"><em>handle</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramname"><em>srcBuf</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>width</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>height</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>subsamp</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">unsigned char **&#160;</td>
+          <td class="paramname"><em>jpegBuf</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">unsigned long *&#160;</td>
+          <td class="paramname"><em>jpegSize</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>jpegQual</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>flags</em>&#160;</td>
+        </tr>
+        <tr>
+          <td></td>
+          <td>)</td>
+          <td></td><td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Compress a YUV planar image into a JPEG image. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor or transformer instance </td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing a YUV planar image to be compressed. The Y, U (Cb), and V (Cr) image planes should be stored sequentially in the buffer, and the size of each plane is determined by the specified width, height, padding, and level of chrominance subsampling. If the chrominance components are subsampled along the horizontal dimension, then the width of the luminance plane should be padded to the nearest multiple of 2 (same goes for the height of the luminance plane, if the chrominance components are subsampled along the vertical dimension.) This is irrespective of any additional padding specified in the <code>pad</code> parameter. </td></tr>
+    <tr><td class="paramname">width</td><td>width (in pixels) of the source image </td></tr>
+    <tr><td class="paramname">pad</td><td>the line padding used in the source image. For instance, if each line in each plane of the YUV image is padded to the nearest multiple of 4 bytes, then <code>pad</code> should be set to 4. </td></tr>
+    <tr><td class="paramname">height</td><td>height (in pixels) of the source image </td></tr>
+    <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling used in the source image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) </td></tr>
+    <tr><td class="paramname">jpegBuf</td><td>address of a pointer to an image buffer that will receive the JPEG image. TurboJPEG has the ability to reallocate the JPEG buffer to accommodate the size of the JPEG image. Thus, you can choose to:<ol type="1">
+<li>pre-allocate the JPEG buffer with an arbitrary size using <a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a> and let TurboJPEG grow the buffer as needed,</li>
+<li>set <code>*jpegBuf</code> to NULL to tell TurboJPEG to allocate the buffer for you, or</li>
+<li>pre-allocate the buffer to a "worst case" size determined by calling <a class="el" href="group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b" title="The maximum size of the buffer (in bytes) required to hold a JPEG image with the given parameters...">tjBufSize()</a>. This should ensure that the buffer never has to be re-allocated (setting <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a> guarantees this.)</li>
+</ol>
+If you choose option 1, <code>*jpegSize</code> should be set to the size of your pre-allocated buffer. In any case, unless you have set <a class="el" href="group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963" title="Disable buffer (re)allocation.">TJFLAG_NOREALLOC</a>, you should always check <code>*jpegBuf</code> upon return from this function, as it may have changed. </td></tr>
+    <tr><td class="paramname">jpegSize</td><td>pointer to an unsigned long variable that holds the size of the JPEG image buffer. If <code>*jpegBuf</code> points to a pre-allocated buffer, then <code>*jpegSize</code> should be set to the size of the buffer. Upon return, <code>*jpegSize</code> will contain the size of the JPEG image (in bytes.) </td></tr>
+    <tr><td class="paramname">jpegQual</td><td>the image quality of the generated JPEG image (1 = worst, 100 = best) </td></tr>
+    <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
+  </table>
+  </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+
+</div>
+</div>
+<a class="anchor" id="ga132ae2c2cadcf64c8bb0f3bdf69da3ed"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">DLLEXPORT int DLLCALL tjDecodeYUV </td>
+          <td>(</td>
+          <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
+          <td class="paramname"><em>handle</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramname"><em>srcBuf</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>subsamp</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramname"><em>dstBuf</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>width</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pitch</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>height</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pixelFormat</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>flags</em>&#160;</td>
+        </tr>
+        <tr>
+          <td></td>
+          <td>)</td>
+          <td></td><td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Decode a YUV planar image into an RGB or grayscale image. </p>
+<p>This function uses the accelerated color conversion routines in the underlying codec but does not execute any of the other steps in the JPEG decompression process. The Y, U (Cb), and V (Cr) image planes should be stored sequentially in the source buffer, and the size of each plane is determined by the width and height of the source image, as well as the specified padding and level of chrominance subsampling. If the chrominance components are subsampled along the horizontal dimension, then the width of the luminance plane should be padded to the nearest multiple of 2 in the input image (same goes for the height of the luminance plane, if the chrominance components are subsampled along the vertical dimension.) </p>
+<p>NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the convention of the digital video community, the TurboJPEG API uses "YUV" to refer to an image format consisting of Y, Cb, and Cr image planes.</p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance </td></tr>
+    <tr><td class="paramname">srcBuf</td><td>pointer to an image buffer containing a YUV planar image to be decoded. The size of this buffer should match the value returned by <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> for the given image width, height, padding, and level of chrominance subsampling. </td></tr>
+    <tr><td class="paramname">pad</td><td>Use this parameter to specify that the width of each line in each plane of the YUV source image is padded to the nearest multiple of this number of bytes (must be a power of 2.) </td></tr>
+    <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling used in the YUV source image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) </td></tr>
+    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the decoded image. This buffer should normally be <code>pitch * height</code> bytes in size, but the <code>dstBuf</code> pointer can also be used to decode into a specific region of a larger buffer. </td></tr>
+    <tr><td class="paramname">width</td><td>width (in pixels) of the source and destination images </td></tr>
+    <tr><td class="paramname">pitch</td><td>bytes per line of the destination image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the destination image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the destination image should be padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use the pitch parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>. </td></tr>
+    <tr><td class="paramname">height</td><td>height (in pixels) of the source and destination images </td></tr>
+    <tr><td class="paramname">pixelFormat</td><td>pixel format of the destination image (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) </td></tr>
+    <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
+  </table>
+  </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+
+</div>
+</div>
 <a class="anchor" id="gada69cc6443d1bb493b40f1626259e5e9"></a>
 <div class="memitem">
 <div class="memproto">
@@ -1041,7 +1251,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>Decompress a JPEG image to an RGB or grayscale image. </p>
+<p>Decompress a JPEG image to an RGB, grayscale, or CMYK image. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance </td></tr>
@@ -1049,26 +1259,23 @@
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes) </td></tr>
     <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the decompressed image. This buffer should normally be <code>pitch * scaledHeight</code> bytes in size, where <code>scaledHeight</code> can be determined by calling <a class="el" href="group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df" title="Compute the scaled value of dimension using the given scaling factor.">TJSCALED()</a> with the JPEG image height and one of the scaling factors returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8" title="Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of Tur...">tjGetScalingFactors()</a>. The <code>dstBuf</code> pointer may also be used to decompress into a specific region of a larger buffer. </td></tr>
     <tr><td class="paramname">width</td><td>desired width (in pixels) of the destination image. If this is different than the width of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired width. If <code>width</code> is set to 0, then only the height will be considered when determining the scaled image size. </td></tr>
-    <tr><td class="paramname">pitch</td><td>bytes per line of the destination image. Normally, this is <code>scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the decompressed image is unpadded, else <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the decompressed image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. (NOTE: <code>scaledWidth</code> can be determined by calling <a class="el" href="group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df" title="Compute the scaled value of dimension using the given scaling factor.">TJSCALED()</a> with the JPEG image width and one of the scaling factors returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8" title="Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of Tur...">tjGetScalingFactors()</a>.) You can also be clever and use the pitch parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>scaledWidth<ul>
-<li><a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]. </li>
-</ul>
-</code></td></tr>
-    <tr><td class="paramname">height</td><td><code>desired height (in pixels) of the destination image. If this is different than the height of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired height. If <code>height</code> is set to 0, then only the width will be considered when determining the scaled image size. </code></td></tr>
-    <tr><td class="paramname">pixelFormat</td><td><code>pixel format of the destination image (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) </code></td></tr>
-    <tr><td class="paramname">flags</td><td><code>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</code></td></tr>
+    <tr><td class="paramname">pitch</td><td>bytes per line of the destination image. Normally, this is <code>scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the decompressed image is unpadded, else <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the decompressed image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. (NOTE: <code>scaledWidth</code> can be determined by calling <a class="el" href="group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df" title="Compute the scaled value of dimension using the given scaling factor.">TJSCALED()</a> with the JPEG image width and one of the scaling factors returned by <a class="el" href="group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8" title="Returns a list of fractional scaling factors that the JPEG decompressor in this implementation of Tur...">tjGetScalingFactors()</a>.) You can also be clever and use the pitch parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>scaledWidth * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>. </td></tr>
+    <tr><td class="paramname">height</td><td>desired height (in pixels) of the destination image. If this is different than the height of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired height. If <code>height</code> is set to 0, then only the width will be considered when determining the scaled image size. </td></tr>
+    <tr><td class="paramname">pixelFormat</td><td>pixel format of the destination image (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) </td></tr>
+    <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd><code> 0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </code></dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
 
 </div>
 </div>
-<a class="anchor" id="gac5675fceb7997b385516cdffdb34e6aa"></a>
+<a class="anchor" id="gacd0fac3af74b3511d39b4781b7103086"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT int DLLCALL tjDecompressHeader2 </td>
+          <td class="memname">DLLEXPORT int DLLCALL tjDecompressHeader3 </td>
           <td>(</td>
           <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
           <td class="paramname"><em>handle</em>, </td>
@@ -1101,7 +1308,13 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int *&#160;</td>
-          <td class="paramname"><em>jpegSubsamp</em>&#160;</td>
+          <td class="paramname"><em>jpegSubsamp</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int *&#160;</td>
+          <td class="paramname"><em>jpegColorspace</em>&#160;</td>
         </tr>
         <tr>
           <td></td>
@@ -1119,7 +1332,8 @@
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes) </td></tr>
     <tr><td class="paramname">width</td><td>pointer to an integer variable that will receive the width (in pixels) of the JPEG image </td></tr>
     <tr><td class="paramname">height</td><td>pointer to an integer variable that will receive the height (in pixels) of the JPEG image </td></tr>
-    <tr><td class="paramname">jpegSubsamp</td><td>pointer to an integer variable that will receive the level of chrominance subsampling used when compressing the JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.)</td></tr>
+    <tr><td class="paramname">jpegSubsamp</td><td>pointer to an integer variable that will receive the level of chrominance subsampling used when compressing the JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) </td></tr>
+    <tr><td class="paramname">jpegColorspace</td><td>pointer to an integer variable that will receive one of the JPEG colorspace constants, indicating the colorspace of the JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">JPEG colorspaces</a>.)</td></tr>
   </table>
   </dd>
 </dl>
@@ -1127,12 +1341,12 @@
 
 </div>
 </div>
-<a class="anchor" id="gad7810af095624a4016e72957a50f77d8"></a>
+<a class="anchor" id="ga7c08b340ad7f8e85d407bd9e81d44d07"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT int DLLCALL tjDecompressToYUV </td>
+          <td class="memname">DLLEXPORT int DLLCALL tjDecompressToYUV2 </td>
           <td>(</td>
           <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
           <td class="paramname"><em>handle</em>, </td>
@@ -1159,6 +1373,24 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>width</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>height</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>flags</em>&#160;</td>
         </tr>
         <tr>
@@ -1170,14 +1402,17 @@
 </div><div class="memdoc">
 
 <p>Decompress a JPEG image to a YUV planar image. </p>
-<p>This function performs JPEG decompression but leaves out the color conversion step, so a planar YUV image is generated instead of an RGB image. The padding of the planes in this image is the same as in the images generated by <a class="el" href="group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35" title="Encode an RGB or grayscale image into a YUV planar image.">tjEncodeYUV2()</a>. Note that, if the width or height of the image is not an even multiple of the MCU block size (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a> and <a class="el" href="group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf" title="MCU block height (in pixels) for a given level of chrominance subsampling.">tjMCUHeight</a>), then an intermediate buffer copy will be performed within TurboJPEG. </p>
+<p>This function performs JPEG decompression but leaves out the color conversion step, so a planar YUV image is generated instead of an RGB image. The structure of the planes in this image is the same as in the images generated by <a class="el" href="group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360" title="Encode an RGB or grayscale image into a YUV planar image.">tjEncodeYUV3()</a>. Note that, if the width or height of the JPEG image is not an even multiple of the MCU block size (see <a class="el" href="group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c" title="MCU block width (in pixels) for a given level of chrominance subsampling.">tjMCUWidth</a> and <a class="el" href="group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf" title="MCU block height (in pixels) for a given level of chrominance subsampling.">tjMCUHeight</a>), then an intermediate buffer copy will be performed within TurboJPEG. </p>
 <p>NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the convention of the digital video community, the TurboJPEG API uses "YUV" to refer to an image format consisting of Y, Cb, and Cr image planes.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG decompressor or transformer instance </td></tr>
     <tr><td class="paramname">jpegBuf</td><td>pointer to a buffer containing the JPEG image to decompress </td></tr>
     <tr><td class="paramname">jpegSize</td><td>size of the JPEG image (in bytes) </td></tr>
-    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV()</a> to determine the appropriate size for this buffer based on the image width, height, and level of subsampling. </td></tr>
+    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> to determine the appropriate size for this buffer based on the image width, height, padding, and level of subsampling. </td></tr>
+    <tr><td class="paramname">width</td><td>desired width (in pixels) of the YUV image. If this is different than the width of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired width. If <code>width</code> is set to 0, then only the height will be considered when determining the scaled image size. </td></tr>
+    <tr><td class="paramname">pad</td><td>the width of each line in each plane of the YUV image will be padded to the nearest multiple of this number of bytes (must be a power of 2.) To generate images suitable for X Video, <code>pad</code> should be set to 4. </td></tr>
+    <tr><td class="paramname">height</td><td>desired height (in pixels) of the YUV image. If this is different than the height of the JPEG image being decompressed, then TurboJPEG will use scaling in the JPEG decompressor to generate the largest possible image that will fit within the desired height. If <code>height</code> is set to 0, then only the width will be considered when determining the scaled image size. </td></tr>
     <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
   </table>
   </dd>
@@ -1211,12 +1446,12 @@
 
 </div>
 </div>
-<a class="anchor" id="ga0fa4e7b1943687c6a0c0304529c55d35"></a>
+<a class="anchor" id="ga0a5ffbf7cb58a5b6a8201114fe889360"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT int DLLCALL tjEncodeYUV2 </td>
+          <td class="memname">DLLEXPORT int DLLCALL tjEncodeYUV3 </td>
           <td>(</td>
           <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
           <td class="paramname"><em>handle</em>, </td>
@@ -1261,6 +1496,12 @@
           <td class="paramkey"></td>
           <td></td>
           <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pad</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
           <td class="paramname"><em>subsamp</em>, </td>
         </tr>
         <tr>
@@ -1278,7 +1519,7 @@
 </div><div class="memdoc">
 
 <p>Encode an RGB or grayscale image into a YUV planar image. </p>
-<p>This function uses the accelerated color conversion routines in TurboJPEG's underlying codec to produce a planar YUV image that is suitable for X Video. Specifically, if the chrominance components are subsampled along the horizontal dimension, then the width of the luminance plane is padded to the nearest multiple of 2 in the output image (same goes for the height of the luminance plane, if the chrominance components are subsampled along the vertical dimension.) Also, each line of each plane in the output image is padded to 4 bytes. Although this will work with any subsampling option, it is really only useful in combination with TJ_420, which produces an image compatible with the I420 (AKA "YUV420P") format. </p>
+<p>This function uses the accelerated color conversion routines in the underlying codec but does not execute any of the other steps in the JPEG compression process. The Y, U (Cb), and V (Cr) image planes are stored sequentially into the destination buffer, and the size of each plane is determined by the width and height of the source image, as well as the specified padding and level of chrominance subsampling. If the chrominance components are subsampled along the horizontal dimension, then the width of the luminance plane is padded to the nearest multiple of 2 in the output image (same goes for the height of the luminance plane, if the chrominance components are subsampled along the vertical dimension.) </p>
 <p>NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the convention of the digital video community, the TurboJPEG API uses "YUV" to refer to an image format consisting of Y, Cb, and Cr image planes.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -1288,8 +1529,9 @@
     <tr><td class="paramname">pitch</td><td>bytes per line of the source image. Normally, this should be <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code> if the image is unpadded, or <code><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511" title="Pad the given width to the nearest 32-bit boundary.">TJPAD</a>(width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat])</code> if each line of the image is padded to the nearest 32-bit boundary, as is the case for Windows bitmaps. You can also be clever and use this parameter to skip lines, etc. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>. </td></tr>
     <tr><td class="paramname">height</td><td>height (in pixels) of the source image </td></tr>
     <tr><td class="paramname">pixelFormat</td><td>pixel format of the source image (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) </td></tr>
-    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV()</a> to determine the appropriate size for this buffer based on the image width, height, and level of chrominance subsampling. </td></tr>
-    <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling to be used when generating the YUV image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) </td></tr>
+    <tr><td class="paramname">dstBuf</td><td>pointer to an image buffer that will receive the YUV image. Use <a class="el" href="group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9" title="The size of the buffer (in bytes) required to hold a YUV planar image with the given parameters...">tjBufSizeYUV2()</a> to determine the appropriate size for this buffer based on the image width, height, padding, and level of chrominance subsampling. </td></tr>
+    <tr><td class="paramname">pad</td><td>the width of each line in each plane of the YUV image will be padded to the nearest multiple of this number of bytes (must be a power of 2.) To generate images suitable for X Video, <code>pad</code> should be set to 4. </td></tr>
+    <tr><td class="paramname">subsamp</td><td>the level of chrominance subsampling to be used when generating the YUV image (see <a class="el" href="group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074">Chrominance subsampling options</a>.) To generate images suitable for X Video, <code>subsamp</code> should be set to <a class="el" href="group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737">TJSAMP_420</a>. This produces an image compatible with the I420 (AKA "YUV420P") format. </td></tr>
     <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
   </table>
   </dd>
@@ -1313,7 +1555,7 @@
 </div><div class="memdoc">
 
 <p>Free an image buffer previously allocated by TurboJPEG. </p>
-<p>You should always use this function to free JPEG destination buffer(s) that were automatically (re)allocated by <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB or grayscale image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> or that were manually allocated using <a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a>.</p>
+<p>You should always use this function to free JPEG destination buffer(s) that were automatically (re)allocated by <a class="el" href="group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2" title="Compress an RGB, grayscale, or CMYK image into a JPEG image.">tjCompress2()</a> or <a class="el" href="group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> or that were manually allocated using <a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff" title="Allocate an image buffer for use with TurboJPEG.">tjAlloc()</a>.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramname">buffer</td><td>address of the buffer to free</td></tr>
@@ -1580,7 +1822,8 @@
 <li>8x8 for no subsampling or grayscale</li>
 <li>16x8 for 4:2:2</li>
 <li>8x16 for 4:4:0</li>
-<li>16x16 for 4:2:0 </li>
+<li>16x16 for 4:2:0</li>
+<li>32x8 for 4:1:1 </li>
 </ul>
 
 </div>
@@ -1609,7 +1852,8 @@
 <li>8x8 for no subsampling or grayscale</li>
 <li>16x8 for 4:2:2</li>
 <li>8x16 for 4:4:0</li>
-<li>16x16 for 4:2:0 </li>
+<li>16x16 for 4:2:0</li>
+<li>32x8 for 4:1:1 </li>
 </ul>
 
 </div>
diff --git a/doc/html/index.html b/doc/html/index.html
index 72daeb8..139b84c 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/modules.html b/doc/html/modules.html
index 25b7ec6..6b769e4 100644
--- a/doc/html/modules.html
+++ b/doc/html/modules.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/search/all_74.js b/doc/html/search/all_74.js
index a1927ba..435cec4 100644
--- a/doc/html/search/all_74.js
+++ b/doc/html/search/all_74.js
@@ -1,26 +1,31 @@
 var searchData=
 [
+  ['tj_5fnumcs',['TJ_NUMCS',['../group___turbo_j_p_e_g.html#ga39f57a6fb02d9cf32e7b6890099b5a71',1,'turbojpeg.h']]],
   ['tj_5fnumpf',['TJ_NUMPF',['../group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e',1,'turbojpeg.h']]],
   ['tj_5fnumsamp',['TJ_NUMSAMP',['../group___turbo_j_p_e_g.html#ga5ef3d169162ce77ce348e292a0b7477c',1,'turbojpeg.h']]],
   ['tj_5fnumxop',['TJ_NUMXOP',['../group___turbo_j_p_e_g.html#ga0f6dbd18adf38b7d46ac547f0f4d562c',1,'turbojpeg.h']]],
   ['tjalloc',['tjAlloc',['../group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff',1,'turbojpeg.h']]],
   ['tjblueoffset',['tjBlueOffset',['../group___turbo_j_p_e_g.html#ga84e2e35d3f08025f976ec1ec53693dea',1,'turbojpeg.h']]],
   ['tjbufsize',['tjBufSize',['../group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b',1,'turbojpeg.h']]],
-  ['tjbufsizeyuv',['tjBufSizeYUV',['../group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c',1,'turbojpeg.h']]],
+  ['tjbufsizeyuv2',['tjBufSizeYUV2',['../group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9',1,'turbojpeg.h']]],
   ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2',1,'turbojpeg.h']]],
+  ['tjcompressfromyuv',['tjCompressFromYUV',['../group___turbo_j_p_e_g.html#ga0b931126c7a615ddc3bbd0cca6698d67',1,'turbojpeg.h']]],
+  ['tjcs',['TJCS',['../group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720',1,'turbojpeg.h']]],
+  ['tjcs_5fcmyk',['TJCS_CMYK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53',1,'turbojpeg.h']]],
+  ['tjcs_5fgray',['TJCS_GRAY',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a',1,'turbojpeg.h']]],
+  ['tjcs_5frgb',['TJCS_RGB',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555',1,'turbojpeg.h']]],
+  ['tjcs_5fycbcr',['TJCS_YCbCr',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75',1,'turbojpeg.h']]],
+  ['tjcs_5fycck',['TJCS_YCCK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e',1,'turbojpeg.h']]],
+  ['tjdecodeyuv',['tjDecodeYUV',['../group___turbo_j_p_e_g.html#ga132ae2c2cadcf64c8bb0f3bdf69da3ed',1,'turbojpeg.h']]],
   ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9',1,'turbojpeg.h']]],
-  ['tjdecompressheader2',['tjDecompressHeader2',['../group___turbo_j_p_e_g.html#gac5675fceb7997b385516cdffdb34e6aa',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuv',['tjDecompressToYUV',['../group___turbo_j_p_e_g.html#gad7810af095624a4016e72957a50f77d8',1,'turbojpeg.h']]],
+  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07',1,'turbojpeg.h']]],
   ['tjdestroy',['tjDestroy',['../group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540',1,'turbojpeg.h']]],
-  ['tjencodeyuv2',['tjEncodeYUV2',['../group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35',1,'turbojpeg.h']]],
+  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360',1,'turbojpeg.h']]],
   ['tjflag_5faccuratedct',['TJFLAG_ACCURATEDCT',['../group___turbo_j_p_e_g.html#gacb233cfd722d66d1ccbf48a7de81f0e0',1,'turbojpeg.h']]],
   ['tjflag_5fbottomup',['TJFLAG_BOTTOMUP',['../group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec',1,'turbojpeg.h']]],
   ['tjflag_5ffastdct',['TJFLAG_FASTDCT',['../group___turbo_j_p_e_g.html#gaabce235db80d3f698b27f36cbd453da2',1,'turbojpeg.h']]],
   ['tjflag_5ffastupsample',['TJFLAG_FASTUPSAMPLE',['../group___turbo_j_p_e_g.html#ga4ee4506c81177a06f77e2504a22efd2d',1,'turbojpeg.h']]],
-  ['tjflag_5fforcemmx',['TJFLAG_FORCEMMX',['../group___turbo_j_p_e_g.html#ga4e872f11c82f241736fa8297920f24e5',1,'turbojpeg.h']]],
-  ['tjflag_5fforcesse',['TJFLAG_FORCESSE',['../group___turbo_j_p_e_g.html#gae17e63189e8cd730feed3efbd2454f38',1,'turbojpeg.h']]],
-  ['tjflag_5fforcesse2',['TJFLAG_FORCESSE2',['../group___turbo_j_p_e_g.html#ga8cf0bca96ea4d472563f4b0ebf8c48e7',1,'turbojpeg.h']]],
-  ['tjflag_5fforcesse3',['TJFLAG_FORCESSE3',['../group___turbo_j_p_e_g.html#gaf9d49066633404da4386d70820295dd2',1,'turbojpeg.h']]],
   ['tjflag_5fnorealloc',['TJFLAG_NOREALLOC',['../group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963',1,'turbojpeg.h']]],
   ['tjfree',['tjFree',['../group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137',1,'turbojpeg.h']]],
   ['tjgeterrorstr',['tjGetErrorStr',['../group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf',1,'turbojpeg.h']]],
@@ -39,6 +44,7 @@
   ['tjpf_5fbgr',['TJPF_BGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aab10624437fb8ef495a0b153e65749839',1,'turbojpeg.h']]],
   ['tjpf_5fbgra',['TJPF_BGRA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aac037ff1845cf9b74bb81a3659c2b9fb4',1,'turbojpeg.h']]],
   ['tjpf_5fbgrx',['TJPF_BGRX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa2a1fbf569ca79897eae886e3376ca4c8',1,'turbojpeg.h']]],
+  ['tjpf_5fcmyk',['TJPF_CMYK',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b',1,'turbojpeg.h']]],
   ['tjpf_5fgray',['TJPF_GRAY',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa5431b54b015337705f13118073711a1a',1,'turbojpeg.h']]],
   ['tjpf_5frgb',['TJPF_RGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7ce93230bff449518ce387c17e6ed37c',1,'turbojpeg.h']]],
   ['tjpf_5frgba',['TJPF_RGBA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa88d2e88fab67f6503cf972e14851cc12',1,'turbojpeg.h']]],
@@ -49,6 +55,7 @@
   ['tjredoffset',['tjRedOffset',['../group___turbo_j_p_e_g.html#gadd9b446742ac8a3923f7992c7988fea8',1,'turbojpeg.h']]],
   ['tjregion',['tjregion',['../structtjregion.html',1,'']]],
   ['tjsamp',['TJSAMP',['../group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074',1,'turbojpeg.h']]],
+  ['tjsamp_5f411',['TJSAMP_411',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2',1,'turbojpeg.h']]],
   ['tjsamp_5f420',['TJSAMP_420',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737',1,'turbojpeg.h']]],
   ['tjsamp_5f422',['TJSAMP_422',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a136130902cc578f11f32429b59368404',1,'turbojpeg.h']]],
   ['tjsamp_5f440',['TJSAMP_440',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974',1,'turbojpeg.h']]],
@@ -56,7 +63,7 @@
   ['tjsamp_5fgray',['TJSAMP_GRAY',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a3f1c9504842ddc7a48d0f690754b6248',1,'turbojpeg.h']]],
   ['tjscaled',['TJSCALED',['../group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df',1,'turbojpeg.h']]],
   ['tjscalingfactor',['tjscalingfactor',['../structtjscalingfactor.html',1,'']]],
-  ['tjtransform',['tjtransform',['../structtjtransform.html',1,'tjtransform'],['../group___turbo_j_p_e_g.html#gaa29f3189c41be12ec5dee7caec318a31',1,'tjtransform():&#160;turbojpeg.h'],['../group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616',1,'tjTransform(tjhandle handle, unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms, int flags):&#160;turbojpeg.h']]],
+  ['tjtransform',['tjtransform',['../structtjtransform.html',1,'tjtransform'],['../group___turbo_j_p_e_g.html#gae403193ceb4aafb7e0f56ab587b48616',1,'tjTransform(tjhandle handle, unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms, int flags):&#160;turbojpeg.h'],['../group___turbo_j_p_e_g.html#gaa29f3189c41be12ec5dee7caec318a31',1,'tjtransform():&#160;turbojpeg.h']]],
   ['tjxop',['TJXOP',['../group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866',1,'turbojpeg.h']]],
   ['tjxop_5fhflip',['TJXOP_HFLIP',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aa0df69776caa30f0fa28e26332d311ce',1,'turbojpeg.h']]],
   ['tjxop_5fnone',['TJXOP_NONE',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aad88c0366cd3f7d0eac9d7a3fa1c2c27',1,'turbojpeg.h']]],
diff --git a/doc/html/search/enums_74.js b/doc/html/search/enums_74.js
index 20bd4db..276aa24 100644
--- a/doc/html/search/enums_74.js
+++ b/doc/html/search/enums_74.js
@@ -1,5 +1,6 @@
 var searchData=
 [
+  ['tjcs',['TJCS',['../group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720',1,'turbojpeg.h']]],
   ['tjpf',['TJPF',['../group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a',1,'turbojpeg.h']]],
   ['tjsamp',['TJSAMP',['../group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074',1,'turbojpeg.h']]],
   ['tjxop',['TJXOP',['../group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866',1,'turbojpeg.h']]]
diff --git a/doc/html/search/enumvalues_74.js b/doc/html/search/enumvalues_74.js
index 55664f1..7dc2f8d 100644
--- a/doc/html/search/enumvalues_74.js
+++ b/doc/html/search/enumvalues_74.js
@@ -1,16 +1,23 @@
 var searchData=
 [
+  ['tjcs_5fcmyk',['TJCS_CMYK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a6c8b636152ac8195b869587db315ee53',1,'turbojpeg.h']]],
+  ['tjcs_5fgray',['TJCS_GRAY',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720ab3e7d6a87f695e45b81c1b5262b5a50a',1,'turbojpeg.h']]],
+  ['tjcs_5frgb',['TJCS_RGB',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555',1,'turbojpeg.h']]],
+  ['tjcs_5fycbcr',['TJCS_YCbCr',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75',1,'turbojpeg.h']]],
+  ['tjcs_5fycck',['TJCS_YCCK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e',1,'turbojpeg.h']]],
   ['tjpf_5fabgr',['TJPF_ABGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa1ba1a7f1631dbeaa49a0a85fc4a40081',1,'turbojpeg.h']]],
   ['tjpf_5fargb',['TJPF_ARGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c',1,'turbojpeg.h']]],
   ['tjpf_5fbgr',['TJPF_BGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aab10624437fb8ef495a0b153e65749839',1,'turbojpeg.h']]],
   ['tjpf_5fbgra',['TJPF_BGRA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aac037ff1845cf9b74bb81a3659c2b9fb4',1,'turbojpeg.h']]],
   ['tjpf_5fbgrx',['TJPF_BGRX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa2a1fbf569ca79897eae886e3376ca4c8',1,'turbojpeg.h']]],
+  ['tjpf_5fcmyk',['TJPF_CMYK',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b',1,'turbojpeg.h']]],
   ['tjpf_5fgray',['TJPF_GRAY',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa5431b54b015337705f13118073711a1a',1,'turbojpeg.h']]],
   ['tjpf_5frgb',['TJPF_RGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7ce93230bff449518ce387c17e6ed37c',1,'turbojpeg.h']]],
   ['tjpf_5frgba',['TJPF_RGBA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa88d2e88fab67f6503cf972e14851cc12',1,'turbojpeg.h']]],
   ['tjpf_5frgbx',['TJPF_RGBX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa83973bebb7e2dc6fa8bae89ff3f42e01',1,'turbojpeg.h']]],
   ['tjpf_5fxbgr',['TJPF_XBGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aaf6603b27147de47e212e75dac027b2af',1,'turbojpeg.h']]],
   ['tjpf_5fxrgb',['TJPF_XRGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aadae996905efcfa3b42a0bb3bea7f9d84',1,'turbojpeg.h']]],
+  ['tjsamp_5f411',['TJSAMP_411',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2',1,'turbojpeg.h']]],
   ['tjsamp_5f420',['TJSAMP_420',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a63085dbf683cfe39e513cdb6343e3737',1,'turbojpeg.h']]],
   ['tjsamp_5f422',['TJSAMP_422',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a136130902cc578f11f32429b59368404',1,'turbojpeg.h']]],
   ['tjsamp_5f440',['TJSAMP_440',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974',1,'turbojpeg.h']]],
diff --git a/doc/html/search/functions_74.js b/doc/html/search/functions_74.js
index c746a91..0a0e6cd 100644
--- a/doc/html/search/functions_74.js
+++ b/doc/html/search/functions_74.js
@@ -2,13 +2,15 @@
 [
   ['tjalloc',['tjAlloc',['../group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff',1,'turbojpeg.h']]],
   ['tjbufsize',['tjBufSize',['../group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b',1,'turbojpeg.h']]],
-  ['tjbufsizeyuv',['tjBufSizeYUV',['../group___turbo_j_p_e_g.html#ga9d0cb06fd5052d21b6f2b382db8b219c',1,'turbojpeg.h']]],
+  ['tjbufsizeyuv2',['tjBufSizeYUV2',['../group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9',1,'turbojpeg.h']]],
   ['tjcompress2',['tjCompress2',['../group___turbo_j_p_e_g.html#gaba62b7a98f960839b588579898495cf2',1,'turbojpeg.h']]],
+  ['tjcompressfromyuv',['tjCompressFromYUV',['../group___turbo_j_p_e_g.html#ga0b931126c7a615ddc3bbd0cca6698d67',1,'turbojpeg.h']]],
+  ['tjdecodeyuv',['tjDecodeYUV',['../group___turbo_j_p_e_g.html#ga132ae2c2cadcf64c8bb0f3bdf69da3ed',1,'turbojpeg.h']]],
   ['tjdecompress2',['tjDecompress2',['../group___turbo_j_p_e_g.html#gada69cc6443d1bb493b40f1626259e5e9',1,'turbojpeg.h']]],
-  ['tjdecompressheader2',['tjDecompressHeader2',['../group___turbo_j_p_e_g.html#gac5675fceb7997b385516cdffdb34e6aa',1,'turbojpeg.h']]],
-  ['tjdecompresstoyuv',['tjDecompressToYUV',['../group___turbo_j_p_e_g.html#gad7810af095624a4016e72957a50f77d8',1,'turbojpeg.h']]],
+  ['tjdecompressheader3',['tjDecompressHeader3',['../group___turbo_j_p_e_g.html#gacd0fac3af74b3511d39b4781b7103086',1,'turbojpeg.h']]],
+  ['tjdecompresstoyuv2',['tjDecompressToYUV2',['../group___turbo_j_p_e_g.html#ga7c08b340ad7f8e85d407bd9e81d44d07',1,'turbojpeg.h']]],
   ['tjdestroy',['tjDestroy',['../group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540',1,'turbojpeg.h']]],
-  ['tjencodeyuv2',['tjEncodeYUV2',['../group___turbo_j_p_e_g.html#ga0fa4e7b1943687c6a0c0304529c55d35',1,'turbojpeg.h']]],
+  ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#ga0a5ffbf7cb58a5b6a8201114fe889360',1,'turbojpeg.h']]],
   ['tjfree',['tjFree',['../group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137',1,'turbojpeg.h']]],
   ['tjgeterrorstr',['tjGetErrorStr',['../group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf',1,'turbojpeg.h']]],
   ['tjgetscalingfactors',['tjGetScalingFactors',['../group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8',1,'turbojpeg.h']]],
diff --git a/doc/html/structtjregion.html b/doc/html/structtjregion.html
index 9ecd917..515686c 100644
--- a/doc/html/structtjregion.html
+++ b/doc/html/structtjregion.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjscalingfactor.html b/doc/html/structtjscalingfactor.html
index 33c7366..f34e150 100644
--- a/doc/html/structtjscalingfactor.html
+++ b/doc/html/structtjscalingfactor.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjtransform.html b/doc/html/structtjtransform.html
index 3199343..ef2c8d5 100644
--- a/doc/html/structtjtransform.html
+++ b/doc/html/structtjtransform.html
@@ -23,7 +23,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.2.1</span>
+   &#160;<span id="projectnumber">1.4</span>
    </div>
   </td>
  </tr>
@@ -108,7 +108,7 @@
 <tr class="memdesc:a688fe8f1a8ecc12a538d9e561cf338e3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Arbitrary data that can be accessed within the body of the callback function.  <a href="#a688fe8f1a8ecc12a538d9e561cf338e3">More...</a><br/></td></tr>
 <tr class="separator:a688fe8f1a8ecc12a538d9e561cf338e3"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a43ee1bcdd2a8d7249a756774f78793c1"><td class="memItemLeft" align="right" valign="top">int(*&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structtjtransform.html#a43ee1bcdd2a8d7249a756774f78793c1">customFilter</a> )(short *coeffs, <a class="el" href="structtjregion.html">tjregion</a> arrayRegion, <a class="el" href="structtjregion.html">tjregion</a> planeRegion, int componentIndex, int transformIndex, struct <a class="el" href="structtjtransform.html">tjtransform</a> *transform)</td></tr>
-<tr class="memdesc:a43ee1bcdd2a8d7249a756774f78793c1"><td class="mdescLeft">&#160;</td><td class="mdescRight">A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG file.  <a href="#a43ee1bcdd2a8d7249a756774f78793c1">More...</a><br/></td></tr>
+<tr class="memdesc:a43ee1bcdd2a8d7249a756774f78793c1"><td class="mdescLeft">&#160;</td><td class="mdescRight">A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG image.  <a href="#a43ee1bcdd2a8d7249a756774f78793c1">More...</a><br/></td></tr>
 <tr class="separator:a43ee1bcdd2a8d7249a756774f78793c1"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
@@ -124,7 +124,7 @@
       </table>
 </div><div class="memdoc">
 
-<p>A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG file. </p>
+<p>A callback function that can be used to modify the DCT coefficients after they are losslessly transformed but before they are transcoded to a new JPEG image. </p>
 <p>This allows for custom filters or other transformations to be applied in the frequency domain.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
diff --git a/doxygen.config b/doxygen.config
index b881d82..9680175 100644
--- a/doxygen.config
+++ b/doxygen.config
@@ -1,5 +1,5 @@
 PROJECT_NAME = TurboJPEG
-PROJECT_NUMBER = 1.2.1
+PROJECT_NUMBER = 1.4
 OUTPUT_DIRECTORY = doc/
 USE_WINDOWS_ENCODING = NO
 OPTIMIZE_OUTPUT_FOR_C = YES
diff --git a/java/TJBench.java b/java/TJBench.java
index eaf5fa3..5257a1b 100644
--- a/java/TJBench.java
+++ b/java/TJBench.java
@@ -36,20 +36,25 @@
 
   static final int YUVENCODE = 1;
   static final int YUVDECODE = 2;
+  static final int YUVCOMPRESS = 3;
 
-  static int flags = 0, yuv = 0, quiet = 0, pf = TJ.PF_BGR;
-  static boolean decompOnly, doTile;
+  static int flags = 0, yuv = 0, quiet = 0, pf = TJ.PF_BGR, yuvpad = 1;
+  static boolean compOnly, decompOnly, doTile;
 
   static final String[] pixFormatStr = {
     "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "GRAY"
   };
 
   static final String[] subNameLong = {
-    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
   };
 
   static final String[] subName = {
-    "444", "422", "420", "GRAY", "440"
+    "444", "422", "420", "GRAY", "440", "411"
+  };
+
+  static final String[] csName = {
+    "RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
   };
 
   static TJScalingFactor sf;
@@ -62,6 +67,16 @@
   }
 
 
+  static String formatName(int subsamp, int cs) {
+    if (cs == TJ.CS_YCbCr)
+      return subNameLong[subsamp];
+    else if (cs == TJ.CS_YCCK)
+      return csName[cs] + " " + subNameLong[subsamp];
+    else
+      return csName[cs];
+  }
+
+
   static String sigFig(double val, int figs) {
     String format;
     int digitsAfterDecimal = figs - (int)Math.ceil(Math.log10(Math.abs(val)));
@@ -127,9 +142,9 @@
     TJDecompressor tjd;
     double start, elapsed;
     int ps = TJ.getPixelSize(pf), i;
-    int yuvSize = TJ.bufSizeYUV(w, h, subsamp), bufsize;
-    int scaledw = (yuv == YUVDECODE) ? w : sf.getScaled(w);
-    int scaledh = (yuv == YUVDECODE) ? h : sf.getScaled(h);
+    int scaledw = sf.getScaled(w);
+    int scaledh = sf.getScaled(h);
+    int yuvSize = TJ.bufSizeYUV(scaledw, yuvpad, scaledh, subsamp), bufsize;
     int pitch = scaledw * ps;
 
     if (jpegQual > 0)
@@ -148,7 +163,7 @@
     /* Execute once to preload cache */
     tjd.setJPEGImage(jpegBuf[0], jpegSize[0]);
     if (yuv == YUVDECODE)
-      tjd.decompressToYUV(dstBuf, flags);
+      tjd.decompressToYUV(dstBuf, scaledw, yuvpad, scaledh, flags);
     else
       tjd.decompress(dstBuf, scaledw, pitch, scaledh, pf, flags);
 
@@ -157,7 +172,7 @@
          i++) {
       int tile = 0;
       if (yuv == YUVDECODE)
-        tjd.decompressToYUV(dstBuf, flags);
+        tjd.decompressToYUV(dstBuf, scaledw, yuvpad, scaledh, flags);
       else {
         for (int y = 0; y < h; y += tileh) {
           for (int x = 0; x < w; x += tilew, tile++) {
@@ -183,23 +198,24 @@
                         (double)(w * h) / 1000000. * (double)i / elapsed);
     }
 
+    if (sf.getNum() != 1 || sf.getDenom() != 1)
+      sizeStr = new String(sf.getNum() + "_" + sf.getDenom());
+    else if (tilew != w || tileh != h)
+      sizeStr = new String(tilew + "x" + tileh);
+    else
+      sizeStr = new String("full");
+    if (decompOnly)
+      tempStr = new String(fileName + "_" + sizeStr +
+                           (yuv != 0 ? ".yuv" : ".bmp"));
+    else
+      tempStr = new String(fileName + "_" + subName[subsamp] + qualStr +
+                           "_" + sizeStr + (yuv != 0 ? ".yuv" : ".bmp"));
+
     if (yuv == YUVDECODE) {
-      tempStr = fileName + "_" + subName[subsamp] + qualStr + ".yuv";
       FileOutputStream fos = new FileOutputStream(tempStr);
       fos.write(dstBuf, 0, yuvSize);
       fos.close();
     } else {
-      if (sf.getNum() != 1 || sf.getDenom() != 1)
-        sizeStr = new String(sf.getNum() + "_" + sf.getDenom());
-      else if (tilew != w || tileh != h)
-        sizeStr = new String(tilew + "x" + tileh);
-      else
-        sizeStr = new String("full");
-      if (decompOnly)
-        tempStr = new String(fileName + "_" + sizeStr + ".bmp");
-      else
-        tempStr = new String(fileName + "_" + subName[subsamp] + qualStr +
-                             "_" + sizeStr + ".bmp");
       saveImage(tempStr, dstBuf, scaledw, scaledh, pf);
       int ndx = tempStr.indexOf('.');
       tempStr = new String(tempStr.substring(0, ndx) + "-err.bmp");
@@ -305,7 +321,9 @@
     int[] jpegSize;
     double start, elapsed;
     int totalJpegSize = 0, tilew, tileh, i;
-    int ps = TJ.getPixelSize(pf), ntilesw = 1, ntilesh = 1, pitch = w * ps;
+    int ps = (yuv == YUVCOMPRESS ? 3 : TJ.getPixelSize(pf));
+    int ntilesw = 1, ntilesh = 1, pitch = w * ps;
+    String pfStr = (yuv == YUVCOMPRESS ? "YUV" : pixFormatStr[pf]);
 
     if (yuv == YUVENCODE) {
       doTestYUV(srcBuf, w, h, subsamp, fileName);
@@ -315,8 +333,7 @@
     tmpBuf = new byte[pitch * h];
 
     if (quiet == 0)
-      System.out.format(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n",
-        pixFormatStr[pf],
+      System.out.format(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
         (flags & TJ.FLAG_BOTTOMUP) != 0 ? "Bottom-up" : "Top-down",
         subNameLong[subsamp], jpegQual);
 
@@ -336,12 +353,16 @@
 
       /* Compression test */
       if (quiet == 1)
-        System.out.format("%s\t%s\t%s\t%d\t", pixFormatStr[pf],
+        System.out.format("%s\t%s\t%s\t%d\t", pfStr,
                           (flags & TJ.FLAG_BOTTOMUP) != 0 ? "BU" : "TD",
                           subNameLong[subsamp], jpegQual);
-      for (i = 0; i < h; i++)
-        System.arraycopy(srcBuf, w * ps * i, tmpBuf, pitch * i, w * ps);
-      tjc.setSourceImage(srcBuf, tilew, pitch, tileh, pf);
+      if (yuv != YUVCOMPRESS)
+        for (i = 0; i < h; i++)
+          System.arraycopy(srcBuf, w * ps * i, tmpBuf, pitch * i, w * ps);
+      if (yuv == YUVCOMPRESS)
+        tjc.setSourceImageYUV(srcBuf, tilew, yuvpad, tileh);
+      else
+        tjc.setSourceImage(srcBuf, tilew, pitch, tileh, pf);
       tjc.setJPEGQuality(jpegQual);
       tjc.setSubsamp(subsamp);
 
@@ -357,7 +378,8 @@
           for (int x = 0; x < w; x += tilew, tile++) {
             int width = Math.min(tilew, w - x);
             int height = Math.min(tileh, h - y);
-            tjc.setSourceImage(srcBuf, x, y, width, pitch, height, pf);
+            if (yuv != YUVCOMPRESS)
+              tjc.setSourceImage(srcBuf, x, y, width, pitch, height, pf);
             tjc.compress(jpegBuf[tile], flags);
             jpegSize[tile] = tjc.getCompressedSize();
             totalJpegSize += jpegSize[tile];
@@ -398,8 +420,9 @@
       }
 
       /* Decompression test */
-      decompTest(srcBuf, jpegBuf, jpegSize, tmpBuf, w, h, subsamp, jpegQual,
-                 fileName, tilew, tileh);
+      if (!compOnly)
+        decompTest(srcBuf, jpegBuf, jpegSize, tmpBuf, w, h, subsamp, jpegQual,
+                   fileName, tilew, tileh);
 
       for (i = 0; i < ntilesw * ntilesh; i++)
         jpegBuf[i] = null;
@@ -417,7 +440,7 @@
     byte[] srcBuf;
     int[] jpegSize;
     int totalJpegSize;
-    int w = 0, h = 0, subsamp = -1, _w, _h, _tilew, _tileh,
+    int w = 0, h = 0, subsamp = -1, cs = -1, _w, _h, _tilew, _tileh,
       _ntilesw, _ntilesh, _subsamp, x, y;
     int ntilesw = 1, ntilesh = 1;
     double start, elapsed;
@@ -439,17 +462,22 @@
     w = tjt.getWidth();
     h = tjt.getHeight();
     subsamp = tjt.getSubsamp();
+    cs = tjt.getColorspace();
 
     if (quiet == 1) {
       System.out.println("All performance values in Mpixels/sec\n");
-      System.out.format("Bitmap\tBitmap\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
+      System.out.format("Bitmap\tBitmap\tJPEG\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
                         (doTile ? "Tile " : "Image"),
                         (doTile ? "Tile " : "Image"));
-      System.out.println("Format\tOrder\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n");
+      System.out.println("Format\tOrder\tCS\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n");
     } else if (quiet == 0) {
-      System.out.format(">>>>>  JPEG %s --> %s (%s)  <<<<<",
-        subNameLong[subsamp], pixFormatStr[pf],
-        (flags & TJ.FLAG_BOTTOMUP) != 0 ? "Bottom-up" : "Top-down");
+      if (yuv == YUVDECODE)
+        System.out.format(">>>>>  JPEG %s --> YUV  <<<<<",
+          formatName(subsamp, cs));
+      else
+        System.out.format(">>>>>  JPEG %s --> %s (%s)  <<<<<",
+          formatName(subsamp, cs), pixFormatStr[pf],
+          (flags & TJ.FLAG_BOTTOMUP) != 0 ? "Bottom-up" : "Top-down");
     }
 
     for (int tilew = doTile ? 16 : w, tileh = doTile ? 16 : h; ;
@@ -470,9 +498,9 @@
                             sf.getScaled(_h));
         System.out.println("");
       } else if (quiet == 1) {
-        System.out.format("%s\t%s\t%s\t", pixFormatStr[pf],
+        System.out.format("%s\t%s\t%s\t%s\t", pixFormatStr[pf],
                           (flags & TJ.FLAG_BOTTOMUP) != 0 ? "BU" : "TD",
-                          subNameLong[subsamp]);
+                          csName[cs], subNameLong[subsamp]);
         System.out.format("%-4d  %-4d\t", tilew, tileh);
       }
 
@@ -582,7 +610,7 @@
     String className = new TJBench().getClass().getName();
 
     System.out.println("\nUSAGE: java " + className);
-    System.out.println("       <Inputfile (BMP)> <Quality> [options]\n");
+    System.out.println("       <Inputfile (BMP|YUV)> <Quality> [options]\n");
     System.out.println("       java " + className);
     System.out.println("       <Inputfile (JPG)> [options]\n");
     System.out.println("Options:\n");
@@ -600,10 +628,21 @@
     System.out.println("     codec");
     System.out.println("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the");
     System.out.println("     underlying codec");
-    System.out.println("-440 = Test 4:4:0 chrominance subsampling instead of 4:2:2");
+    System.out.println("-subsamp <s> = if compressing a JPEG image from a YUV planar source image,");
+    System.out.println("     this specifies the level of chrominance subsampling used in the source");
+    System.out.println("     image.  Otherwise, this specifies the level of chrominance subsampling");
+    System.out.println("     to use in the JPEG destination image.  <s> = 444, 422, 440, 420, 411,");
+    System.out.println("     or GRAY");
     System.out.println("-quiet = Output results in tabular rather than verbose format");
     System.out.println("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG");
     System.out.println("-yuvdecode = Decode JPEG image to planar YUV rather than RGB");
+    System.out.println("-yuvsize WxH = if compressing a JPEG image from a YUV planar source image, this");
+    System.out.println("     specifies the width and height of the source image.");
+    System.out.println("-yuvpad <p> = if compressing a JPEG image from a YUV planar source image, this");
+    System.out.println("     specifies the number of bytes to which each row of each plane in the");
+    System.out.println("     source image is padded.  If decompressing a JPEG image to a YUV planar");
+    System.out.println("     destination image, this specifies the row padding for each plane of the");
+    System.out.println("     destination image. (default=1)");
     System.out.println("-scale M/N = scale down the width/height of the decompressed JPEG image by a");
     System.out.print  ("     factor of M/N (M/N = ");
     for (i = 0; i < nsf; i++) {
@@ -626,7 +665,8 @@
     System.out.println("     decompression (these options are mutually exclusive)");
     System.out.println("-grayscale = Perform lossless grayscale conversion prior to decompression");
     System.out.println("     test (can be combined with the other transforms above)");
-    System.out.println("-benchTime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n");
+    System.out.println("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)");
+	System.out.println("-componly = Stop after running compression tests.  Do not test decompression.\n");
     System.out.println("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate");
     System.out.println("test will be performed for all quality values in the range.\n");
     System.exit(1);
@@ -637,7 +677,7 @@
     byte[] srcBuf = null;  int w = 0, h = 0;
     int minQual = -1, maxQual = -1;
     int minArg = 1;  int retval = 0;
-    boolean do440 = false;
+    int subsamp = -1;
 
     try {
 
@@ -647,6 +687,8 @@
       String tempStr = argv[0].toLowerCase();
       if (tempStr.endsWith(".jpg") || tempStr.endsWith(".jpeg"))
         decompOnly = true;
+      if (tempStr.endsWith(".yuv"))
+        yuv = YUVCOMPRESS;
 
       System.out.println("");
 
@@ -715,8 +757,6 @@
             System.out.println("Using most accurate DCT/IDCT algorithm\n");
             flags |= TJ.FLAG_ACCURATEDCT;
           }
-          if (argv[i].equals("-440"))
-            do440 = true;
           if (argv[i].equalsIgnoreCase("-rgb"))
             pf = TJ.PF_RGB;
           if (argv[i].equalsIgnoreCase("-rgbx"))
@@ -786,6 +826,44 @@
             else
               usage();
           }
+          if (argv[i].equalsIgnoreCase("-yuvsize") && i < argv.length - 1) {
+            int temp1 = 0, temp2 = 0;
+            Scanner scanner = new Scanner(argv[++i]).useDelimiter("x");
+            try {
+              temp1 = scanner.nextInt();
+              temp2 = scanner.nextInt();
+            } catch(Exception e) {}
+            if (temp1 >= 1 && temp2 >= 1) {
+              w = temp1;
+              h = temp2;
+            } else
+              usage();
+          }
+          if (argv[i].equalsIgnoreCase("-yuvpad") && i < argv.length - 1) {
+            int temp = 0;
+            try {
+             temp = Integer.parseInt(argv[++i]);
+            } catch (NumberFormatException e) {}
+            if (temp >= 1)
+              yuvpad = temp;
+          }
+          if (argv[i].equalsIgnoreCase("-subsamp") && i < argv.length - 1) {
+            i++;
+            if (argv[i].toUpperCase().startsWith("G"))
+              subsamp = TJ.SAMP_GRAY;
+            else if (argv[i].equals("444"))
+              subsamp = TJ.SAMP_444;
+            else if (argv[i].equals("422"))
+              subsamp = TJ.SAMP_422;
+            else if (argv[i].equals("440"))
+              subsamp = TJ.SAMP_440;
+            else if (argv[i].equals("420"))
+              subsamp = TJ.SAMP_420;
+            else if (argv[i].equals("411"))
+              subsamp = TJ.SAMP_411;
+          }
+          if (argv[i].equalsIgnoreCase("-componly"))
+            compOnly = true;
           if (argv[i].equalsIgnoreCase("-?"))
             usage();
         }
@@ -802,17 +880,30 @@
 
       if (yuv != 0 && doTile) {
         System.out.println("Disabling tiled compression/decompression tests, because those tests do not");
-        System.out.println("work when YUV encoding or decoding is enabled.\n");
+        System.out.println("work when YUV encoding, compression, or decoding is enabled.\n");
         doTile = false;
       }
 
       if (!decompOnly) {
-        int[] width = new int[1], height = new int[1];
-        srcBuf = loadImage(argv[0], width, height, pf);
-        w = width[0];  h = height[0];
-        int index = -1;
-        if ((index = argv[0].indexOf('.')) >= 0)
-          argv[0] = argv[0].substring(0, index);
+        if(yuv == YUVCOMPRESS) {
+          if (w < 1 || h < 1 || subsamp < 0 || subsamp >= TJ.NUMSAMP)
+            throw new Exception("YUV image size and/or subsampling not specified");
+          FileInputStream fis = new FileInputStream(argv[0]);
+          int srcSize = (int)fis.getChannel().size();
+          if (srcSize != TJ.bufSizeYUV(w, yuvpad, h, subsamp))
+            throw new Exception("YUV image file is the wrong size");
+          srcBuf = new byte[srcSize];
+          fis.read(srcBuf, 0, srcSize);
+          fis.close();
+		}
+        else {
+          int[] width = new int[1], height = new int[1];
+          srcBuf = loadImage(argv[0], width, height, pf);
+          w = width[0];  h = height[0];
+          int index = -1;
+          if ((index = argv[0].indexOf('.')) >= 0)
+            argv[0] = argv[0].substring(0, index);
+        }
       }
 
       if (quiet == 1 && !decompOnly) {
@@ -829,21 +920,27 @@
       }
 
       System.gc();
-      for (int i = maxQual; i >= minQual; i--)
-        doTest(srcBuf, w, h, TJ.SAMP_GRAY, i, argv[0]);
-      System.out.println("");
-      System.gc();
-      for (int i = maxQual; i >= minQual; i--)
-        doTest(srcBuf, w, h, TJ.SAMP_420, i, argv[0]);
-      System.out.println("");
-      System.gc();
-      for (int i = maxQual; i >= minQual; i--)
-        doTest(srcBuf, w, h, do440 ? TJ.SAMP_440 : TJ.SAMP_422, i, argv[0]);
-      System.out.println("");
-      System.gc();
-      for (int i = maxQual; i >= minQual; i--)
-        doTest(srcBuf, w, h, TJ.SAMP_444, i, argv[0]);
-      System.out.println("");
+      if (yuv == YUVCOMPRESS || (subsamp >= 0 && subsamp < TJ.NUMSAMP)) {
+        for (int i = maxQual; i >= minQual; i--)
+          doTest(srcBuf, w, h, subsamp, i, argv[0]);
+        System.out.println("");
+      } else {
+        for (int i = maxQual; i >= minQual; i--)
+          doTest(srcBuf, w, h, TJ.SAMP_GRAY, i, argv[0]);
+        System.out.println("");
+        System.gc();
+        for (int i = maxQual; i >= minQual; i--)
+          doTest(srcBuf, w, h, TJ.SAMP_420, i, argv[0]);
+        System.out.println("");
+        System.gc();
+        for (int i = maxQual; i >= minQual; i--)
+          doTest(srcBuf, w, h, TJ.SAMP_422, i, argv[0]);
+        System.out.println("");
+        System.gc();
+        for (int i = maxQual; i >= minQual; i--)
+          doTest(srcBuf, w, h, TJ.SAMP_444, i, argv[0]);
+        System.out.println("");
+      }
 
     } catch (Exception e) {
       System.out.println("ERROR: " + e.getMessage());
diff --git a/java/TJUnitTest.java b/java/TJUnitTest.java
index ffe82e9..a6ee73e 100644
--- a/java/TJUnitTest.java
+++ b/java/TJUnitTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2012, 2014 D. R. Commander.  
+ * Copyright (C)2011-2014 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,24 +46,26 @@
     System.out.println("\nUSAGE: java " + classname + " [options]\n");
     System.out.println("Options:\n");
     System.out.println("-yuv = test YUV encoding/decoding support\n");
+    System.out.println("-noyuvpad = do not pad each line of each Y, U, and V plane to the nearest\n");
+    System.out.println("            4-byte boundary\n");
     System.out.println("-bi = test BufferedImage support\n");
     System.exit(1);
   }
 
   private static final String[] subNameLong = {
-    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+    "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
   };
   private static final String[] subName = {
-    "444", "422", "420", "GRAY", "440"
+    "444", "422", "420", "GRAY", "440", "411"
   };
 
   private static final String[] pixFormatStr = {
     "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
-    "RGBA", "BGRA", "ABGR", "ARGB"
+    "RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
   };
 
   private static final int[] alphaOffset = {
-    -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0
+    -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
   };
 
   private static final int[] _3byteFormats = {
@@ -73,7 +75,7 @@
     BufferedImage.TYPE_3BYTE_BGR
   };
   private static final int[] _4byteFormats = {
-    TJ.PF_RGBX, TJ.PF_BGRX, TJ.PF_XBGR, TJ.PF_XRGB
+    TJ.PF_RGBX, TJ.PF_BGRX, TJ.PF_XBGR, TJ.PF_XRGB, TJ.PF_CMYK
   };
   private static final int[] _4byteFormatsBI = {
     BufferedImage.TYPE_INT_BGR, BufferedImage.TYPE_INT_RGB,
@@ -93,6 +95,7 @@
   private static final int YUVENCODE = 1;
   private static final int YUVDECODE = 2;
   private static int yuv = 0;
+  private static int pad = 4;
   private static boolean bi = false;
 
   private static int exitStatus = 0;
@@ -162,8 +165,8 @@
     int ps = TJ.getPixelSize(pf);
     int index, row, col, halfway = 16;
 
-    Arrays.fill(buf, (byte)0);
     if (pf == TJ.PF_GRAY) {
+      Arrays.fill(buf, (byte)0);
       for (row = 0; row < h; row++) {
         for (col = 0; col < w; col++) {
           if ((flags & TJ.FLAG_BOTTOMUP) != 0)
@@ -178,6 +181,27 @@
       }
       return;
     }
+    if (pf == TJ.PF_CMYK) {
+      Arrays.fill(buf, (byte)255);
+      for (row = 0; row < h; row++) {
+        for (col = 0; col < w; col++) {
+          if ((flags & TJ.FLAG_BOTTOMUP) != 0)
+            index = (h - row - 1) * w + col;
+          else
+            index = row * w + col;
+          if (((row / 8) + (col / 8)) % 2 == 0) {
+            if (row >= halfway) buf[index * ps + 3] = 0;
+          } else {
+            buf[index * ps + 2] = 0;
+            if (row < halfway)
+              buf[index * ps + 1] = 0;
+          }
+        }
+      }
+      return;
+    }
+
+    Arrays.fill(buf, (byte)0);
     for (row = 0; row < h; row++) {
       for (col = 0; col < w; col++) {
         if ((flags & TJ.FLAG_BOTTOMUP) != 0)
@@ -296,6 +320,39 @@
     int blockSize = 8 * sf.getNum() / sf.getDenom();
 
     try {
+
+      if (pf == TJ.PF_CMYK) {
+        for (row = 0; row < h; row++) {
+          for (col = 0; col < w; col++) {
+            if ((flags & TJ.FLAG_BOTTOMUP) != 0)
+              index = (h - row - 1) * w + col;
+            else
+              index = row * w + col;
+            byte c = buf[index * ps];
+            byte m = buf[index * ps + 1];
+            byte y = buf[index * ps + 2]; 
+            byte k = buf[index * ps + 3];
+            checkVal255(row, col, c, "C");
+            if (((row / blockSize) + (col / blockSize)) % 2 == 0) {
+              checkVal255(row, col, m, "M");
+              checkVal255(row, col, y, "Y");
+              if (row < halfway)
+                checkVal255(row, col, k, "K");
+              else
+                checkVal0(row, col, k, "K");
+            } else {
+              checkVal0(row, col, y, "Y");
+              checkVal255(row, col, k, "K");
+              if (row < halfway)
+                checkVal0(row, col, m, "M");
+              else
+                checkVal255(row, col, m, "M");
+            }
+          }
+        }
+        return 1;
+      }
+
       for (row = 0; row < halfway; row++) {
         for (col = 0; col < w; col++) {
           if ((flags & TJ.FLAG_BOTTOMUP) != 0)
@@ -348,13 +405,25 @@
     if (retval == 0) {
       for (row = 0; row < h; row++) {
         for (col = 0; col < w; col++) {
-          int r = buf[pitch * row + col * ps + roffset];
-          int g = buf[pitch * row + col * ps + goffset];
-          int b = buf[pitch * row + col * ps + boffset];
-          if (r < 0) r += 256;
-          if (g < 0) g += 256;
-          if (b < 0) b += 256;
-          System.out.format("%3d/%3d/%3d ", r, g, b);
+          if (pf == TJ.PF_CMYK) {
+            int c = buf[pitch * row + col * ps];
+            int m = buf[pitch * row + col * ps + 1];
+            int y = buf[pitch * row + col * ps + 2];
+            int k = buf[pitch * row + col * ps + 3];
+            if (c < 0) c += 256;
+            if (m < 0) m += 256;
+            if (y < 0) y += 256;
+            if (k < 0) k += 256;
+            System.out.format("%3d/%3d/%3d/%3d ", c, m, y, k);
+          } else {
+            int r = buf[pitch * row + col * ps + roffset];
+            int g = buf[pitch * row + col * ps + goffset];
+            int b = buf[pitch * row + col * ps + boffset];
+            if (r < 0) r += 256;
+            if (g < 0) g += 256;
+            if (b < 0) b += 256;
+            System.out.format("%3d/%3d/%3d ", r, g, b);
+          }
         }
         System.out.print("\n");
       }
@@ -469,17 +538,67 @@
     return ((v + (p) - 1) & (~((p) - 1)));
   }
 
-  private static int checkBufYUV(byte[] buf, int size, int w, int h,
+  private static void initBufYUV(byte[] buf, int w, int pad, int h,
                                  int subsamp) throws Exception {
     int row, col;
     int hsf = TJ.getMCUWidth(subsamp) / 8, vsf = TJ.getMCUHeight(subsamp) / 8;
     int pw = PAD(w, hsf), ph = PAD(h, vsf);
     int cw = pw / hsf, ch = ph / vsf;
-    int ypitch = PAD(pw, 4), uvpitch = PAD(cw, 4);
+    int ypitch = PAD(pw, pad), uvpitch = PAD(cw, pad);
+    int halfway = 16, blockSize = 8;
+
+    Arrays.fill(buf, (byte)0);
+    for (row = 0; row < ph; row++) {
+      for (col = 0; col < pw; col++) {
+        int index = ypitch * row + col;
+        if (((row / blockSize) + (col / blockSize)) % 2 == 0) {
+          if (row < halfway)
+            buf[index] = (byte)255;
+          else
+            buf[index] = 0;
+        } else {
+          if (row < halfway)
+            buf[index] = 76;
+          else
+            buf[index] = (byte)226;
+        }
+      }
+    }
+    if (subsamp != TJ.SAMP_GRAY) {
+      halfway = 16 / vsf;
+      for (row = 0; row < ch; row++) {
+        for (col = 0; col < cw; col++) {
+          int uindex = ypitch * ph + (uvpitch * row + col),
+            vindex = ypitch * ph + uvpitch * ch + (uvpitch * row + col);
+          if (((row * vsf / blockSize) + (col * hsf / blockSize)) % 2 == 0) {
+            buf[uindex] = buf[vindex] = (byte)128;
+          } else {
+            if (row < halfway) {
+              buf[uindex] = 85;
+              buf[vindex] = (byte)255;
+            } else {
+              buf[uindex] = 0;
+              buf[vindex] = (byte)149;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  private static int checkBufYUV(byte[] buf, int size, int w, int h,
+                                 int subsamp, TJScalingFactor sf)
+                                 throws Exception {
+    int row, col;
+    int hsf = TJ.getMCUWidth(subsamp) / 8, vsf = TJ.getMCUHeight(subsamp) / 8;
+    int pw = PAD(w, hsf), ph = PAD(h, vsf);
+    int cw = pw / hsf, ch = ph / vsf;
+    int ypitch = PAD(pw, pad), uvpitch = PAD(cw, pad);
     int retval = 1;
     int correctsize = ypitch * ph +
                       (subsamp == TJ.SAMP_GRAY ? 0 : uvpitch * ch * 2);
-    int halfway = 16;
+    int halfway = 16 * sf.getNum() / sf.getDenom();
+    int blockSize = 8 * sf.getNum() / sf.getDenom();
 
     try {
       if (size != correctsize)
@@ -489,7 +608,7 @@
       for (row = 0; row < ph; row++) {
         for (col = 0; col < pw; col++) {
           byte y = buf[ypitch * row + col];
-          if (((row / 8) + (col / 8)) % 2 == 0) {
+          if (((row / blockSize) + (col / blockSize)) % 2 == 0) {
             if (row < halfway)
               checkVal255(row, col, y, "Y");
             else
@@ -503,12 +622,12 @@
         }
       }
       if (subsamp != TJ.SAMP_GRAY) {
-        halfway = 16 / vsf;
+        halfway = 16 / vsf * sf.getNum() / sf.getDenom();
         for (row = 0; row < ch; row++) {
           for (col = 0; col < cw; col++) {
             byte u = buf[ypitch * ph + (uvpitch * row + col)],
                  v = buf[ypitch * ph + uvpitch * ch + (uvpitch * row + col)];
-            if (((row * vsf / 8) + (col * hsf / 8)) % 2 == 0) {
+            if (((row * vsf / blockSize) + (col * hsf / blockSize)) % 2 == 0) {
               checkVal(row, col, u, "U", 128);
               checkVal(row, col, v, "V", 128);
             } else {
@@ -575,53 +694,64 @@
     byte[] srcBuf = null;
     BufferedImage img = null;
     String pfStr;
+    String buStrLong = (flags & TJ.FLAG_BOTTOMUP) != 0 ?
+                       "Bottom-Up" : "Top-Down ";
+    String buStr = (flags & TJ.FLAG_BOTTOMUP) != 0 ? "BU" : "TD";
     double t;
     int size = 0, ps, imgType = pf;
 
-    if (bi) {
-      pf = biTypePF(imgType);
-      pfStr = biTypeStr(imgType);
-    } else
-      pfStr = pixFormatStr[pf];
-    ps =  TJ.getPixelSize(pf);
-
-    System.out.print(pfStr + " ");
-    if (bi)
-      System.out.print("(" + pixFormatStr[pf] + ") ");
-    if ((flags & TJ.FLAG_BOTTOMUP) != 0)
-      System.out.print("Bottom-Up");
-    else
-      System.out.print("Top-Down ");
-    System.out.print(" -> " + subNameLong[subsamp] + " ");
-    if (yuv == YUVENCODE)
-      System.out.print("YUV ... ");
-    else
-      System.out.print("Q" + jpegQual + " ... ");
-
-    if (bi) {
-      img = new BufferedImage(w, h, imgType);
-      initImg(img, pf, flags);
-      tempstr = baseName + "_enc_" + pfStr + "_" +
-                (((flags & TJ.FLAG_BOTTOMUP) != 0) ? "BU" : "TD") + "_" +
-                subName[subsamp] + "_Q" + jpegQual + ".png";
-      File file = new File(tempstr);
-      ImageIO.write(img, "png", file);
+    if (yuv == YUVDECODE) {
+      System.out.format("YUV %s %s --> JPEG Q%d ... ", subNameLong[subsamp],
+                        buStrLong, jpegQual);
+      srcBuf = new byte[TJ.bufSizeYUV(w, pad, h, subsamp)];
+      initBufYUV(srcBuf, w, pad, h, subsamp);
+      pfStr = "YUV";
     } else {
-      srcBuf = new byte[w * h * ps + 1];
-      initBuf(srcBuf, w, w * ps, h, pf, flags);
+      if (bi) {
+        pf = biTypePF(imgType);
+        pfStr = biTypeStr(imgType);
+      } else
+        pfStr = pixFormatStr[pf];
+      ps =  TJ.getPixelSize(pf);
+
+      System.out.print(pfStr + " ");
+      if (bi)
+        System.out.print("(" + pixFormatStr[pf] + ") ");
+      if (yuv == YUVENCODE)
+        System.out.format("%s -> %s YUV ... ", buStrLong,
+                          subNameLong[subsamp]);
+      else
+        System.out.format("%s -> %s Q%d ... ", buStrLong, subNameLong[subsamp],
+                          jpegQual);
+
+      if (bi) {
+        img = new BufferedImage(w, h, imgType);
+        initImg(img, pf, flags);
+        tempstr = baseName + "_enc_" + pfStr + "_" + buStr + "_" +
+                  subName[subsamp] + "_Q" + jpegQual + ".png";
+        File file = new File(tempstr);
+        ImageIO.write(img, "png", file);
+      } else {
+        srcBuf = new byte[w * h * ps + 1];
+        initBuf(srcBuf, w, w * ps, h, pf, flags);
+      }
     }
     Arrays.fill(dstBuf, (byte)0);
 
     t = getTime();
     tjc.setSubsamp(subsamp);
     tjc.setJPEGQuality(jpegQual);
+    tjc.setYUVPad(pad);
     if (bi) {
       if (yuv == YUVENCODE)
         tjc.encodeYUV(img, dstBuf, flags);
       else
         tjc.compress(img, dstBuf, flags);
     } else {
-      tjc.setSourceImage(srcBuf, w, 0, h, pf);
+      if (yuv == YUVDECODE)
+        tjc.setSourceImageYUV(srcBuf, w, pad, h);
+      else
+        tjc.setSourceImage(srcBuf, w, 0, h, pf);
       if (yuv == YUVENCODE)
         tjc.encodeYUV(dstBuf, flags);
       else
@@ -631,17 +761,16 @@
     t = getTime() - t;
 
     if (yuv == YUVENCODE)
-      tempstr = baseName + "_enc_" + pfStr + "_" +
-                (((flags & TJ.FLAG_BOTTOMUP) != 0) ? "BU" : "TD") + "_" +
+      tempstr = baseName + "_enc_" + pfStr + "_" + buStr + "_" +
                 subName[subsamp] + ".yuv";
     else
-      tempstr = baseName + "_enc_" + pfStr + "_" +
-                (((flags & TJ.FLAG_BOTTOMUP) != 0) ? "BU" : "TD") + "_" +
+      tempstr = baseName + "_enc_" + pfStr + "_" + buStr + "_" +
                 subName[subsamp] + "_Q" + jpegQual + ".jpg";
     writeJPEG(dstBuf, size, tempstr);
 
     if (yuv == YUVENCODE) {
-      if (checkBufYUV(dstBuf, size, w, h, subsamp) == 1)
+      if (checkBufYUV(dstBuf, size, w, h, subsamp,
+                      new TJScalingFactor(1, 1)) == 1)
         System.out.print("Passed.");
       else {
         System.out.print("FAILED!");
@@ -677,7 +806,7 @@
 
     System.out.print("JPEG -> ");
     if (yuv == YUVDECODE)
-      System.out.print("YUV " + subNameLong[subsamp] + " ... ");
+      System.out.print("YUV " + subNameLong[subsamp] + " ");
     else {
       System.out.print(pfStr + " ");
       if (bi)
@@ -686,11 +815,11 @@
         System.out.print("Bottom-Up ");
       else
         System.out.print("Top-Down  ");
-      if (!sf.isOne())
-        System.out.print(sf.getNum() + "/" + sf.getDenom() + " ... ");
-      else
-        System.out.print("... ");
     }
+    if (!sf.isOne())
+      System.out.print(sf.getNum() + "/" + sf.getDenom() + " ... ");
+    else
+      System.out.print("... ");
 
     t = getTime();
     tjd.setJPEGImage(jpegBuf, jpegSize);
@@ -706,7 +835,7 @@
       throw new Exception("Scaled size mismatch");
 
     if (yuv == YUVDECODE)
-      dstBuf = tjd.decompressToYUV(flags);
+      dstBuf = tjd.decompressToYUV(scaledWidth, pad, scaledHeight, flags);
     else {
       if (bi)
         img = tjd.decompress(scaledWidth, scaledHeight, imgType, flags);
@@ -725,7 +854,8 @@
     }
 
     if (yuv == YUVDECODE) {
-      if (checkBufYUV(dstBuf, dstBuf.length, w, h, subsamp) == 1)
+      if (checkBufYUV(dstBuf, dstBuf.length, scaledWidth, scaledHeight,
+                      subsamp, sf) == 1)
         System.out.print("Passed.");
       else {
         System.out.print("FAILED!");  exitStatus = -1;
@@ -749,14 +879,18 @@
                                  String baseName, int subsamp,
                                  int flags) throws Exception {
     int i;
-    if ((subsamp == TJ.SAMP_444 || subsamp == TJ.SAMP_GRAY) && yuv == 0) {
-      TJScalingFactor[] sf = TJ.getScalingFactors();
-      for (i = 0; i < sf.length; i++)
+    TJScalingFactor[] sf = TJ.getScalingFactors();
+    for (i = 0; i < sf.length; i++) {
+      int num = sf[i].getNum();
+      int denom = sf[i].getDenom();
+      if (subsamp == TJ.SAMP_444 || subsamp == TJ.SAMP_GRAY ||
+          (subsamp == TJ.SAMP_411 && num == 1 &&
+           (denom == 2 || denom == 1)) ||
+          (subsamp != TJ.SAMP_411 && num == 1 &&
+           (denom == 4 || denom == 2 || denom == 1)))
         decompTest(tjd, jpegBuf, jpegSize, w, h, pf, baseName, subsamp,
                    flags, sf[i]);
-    } else
-      decompTest(tjd, jpegBuf, jpegSize, w, h, pf, baseName, subsamp,
-                 flags, new TJScalingFactor(1, 1));
+    }
   }
 
   private static void doTest(int w, int h, int[] formats, int subsamp,
@@ -767,7 +901,7 @@
     byte[] dstBuf;
 
     if (yuv == YUVENCODE)
-      dstBuf = new byte[TJ.bufSizeYUV(w, h, subsamp)];
+      dstBuf = new byte[TJ.bufSizeYUV(w, pad, h, subsamp)];
     else
       dstBuf = new byte[TJ.bufSize(w, h, subsamp)];
 
@@ -776,10 +910,11 @@
       tjd = new TJDecompressor();
 
       for (int pf : formats) {
+        if (pf < 0) continue;
         for (int i = 0; i < 2; i++) {
           int flags = 0;
           if (subsamp == TJ.SAMP_422 || subsamp == TJ.SAMP_420 ||
-              subsamp == TJ.SAMP_440)
+              subsamp == TJ.SAMP_440 || subsamp == TJ.SAMP_411)
             flags |= TJ.FLAG_FASTUPSAMPLE;
           if (i == 1) {
             if (yuv == YUVDECODE) {
@@ -825,7 +960,7 @@
               System.out.format("%04d x %04d\b\b\b\b\b\b\b\b\b\b\b", w, h);
             srcBuf = new byte[w * h * 4];
             if (yuv == YUVENCODE)
-              dstBuf = new byte[TJ.bufSizeYUV(w, h, subsamp)];
+              dstBuf = new byte[TJ.bufSizeYUV(w, pad, h, subsamp)];
             else
               dstBuf = new byte[TJ.bufSize(w, h, subsamp)];
             for (i = 0; i < w * h * 4; i++) {
@@ -834,6 +969,7 @@
             tjc.setSourceImage(srcBuf, w, 0, h, TJ.PF_BGRX);
             tjc.setSubsamp(subsamp);
             tjc.setJPEGQuality(100);
+            tjc.setYUVPad(pad);
             if (yuv == YUVENCODE)
               tjc.encodeYUV(dstBuf, 0);
             else
@@ -841,7 +977,7 @@
 
             srcBuf = new byte[h * w * 4];
             if (yuv == YUVENCODE)
-              dstBuf = new byte[TJ.bufSizeYUV(h, w, subsamp)];
+              dstBuf = new byte[TJ.bufSizeYUV(h, pad, w, subsamp)];
             else
               dstBuf = new byte[TJ.bufSize(h, w, subsamp)];
             for (i = 0; i < h * w * 4; i++) {
@@ -870,6 +1006,8 @@
       for (int i = 0; i < argv.length; i++) {
         if (argv[i].equalsIgnoreCase("-yuv"))
           doyuv = true;
+        if (argv[i].equalsIgnoreCase("-noyuvpad"))
+          pad = 1;
         if (argv[i].substring(0, 1).equalsIgnoreCase("-h") ||
             argv[i].equalsIgnoreCase("-?"))
           usage();
@@ -878,7 +1016,10 @@
           testName = "javabitest";
         }
       }
-      if (doyuv) yuv = YUVENCODE;
+      if (doyuv) {
+        yuv = YUVENCODE;
+        _4byteFormats[4] = -1;
+      }
       doTest(35, 39, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_444,
              testName);
       doTest(39, 41, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_444,
@@ -895,10 +1036,15 @@
              testName);
       doTest(39, 41, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_440,
              testName);
-      doTest(35, 39, bi ? onlyGrayBI : onlyGray, TJ.SAMP_GRAY, testName);
-      doTest(39, 41, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_GRAY,
+      doTest(41, 35, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_411,
              testName);
-      doTest(41, 35, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_GRAY,
+      doTest(35, 39, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_411,
+             testName);
+      doTest(39, 41, bi ? onlyGrayBI : onlyGray, TJ.SAMP_GRAY, testName);
+      doTest(41, 35, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_GRAY,
+             testName);
+      _4byteFormats[4] = -1;
+      doTest(35, 39, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_GRAY,
              testName);
       if (!bi)
         bufSizeTest();
@@ -913,10 +1059,12 @@
         doTest(41, 35, onlyRGB, TJ.SAMP_420, "javatest_yuv1");
         doTest(48, 48, onlyRGB, TJ.SAMP_440, "javatest_yuv0");
         doTest(35, 39, onlyRGB, TJ.SAMP_440, "javatest_yuv1");
+        doTest(48, 48, onlyRGB, TJ.SAMP_411, "javatest_yuv0");
+        doTest(39, 41, onlyRGB, TJ.SAMP_411, "javatest_yuv1");
         doTest(48, 48, onlyRGB, TJ.SAMP_GRAY, "javatest_yuv0");
-        doTest(35, 39, onlyRGB, TJ.SAMP_GRAY, "javatest_yuv1");
+        doTest(41, 35, onlyRGB, TJ.SAMP_GRAY, "javatest_yuv1");
         doTest(48, 48, onlyGray, TJ.SAMP_GRAY, "javatest_yuv0");
-        doTest(39, 41, onlyGray, TJ.SAMP_GRAY, "javatest_yuv1");
+        doTest(35, 39, onlyGray, TJ.SAMP_GRAY, "javatest_yuv1");
       }
     } catch(Exception e) {
       e.printStackTrace();
diff --git a/java/doc/constant-values.html b/java/doc/constant-values.html
index e4adb67..01f950f 100644
--- a/java/doc/constant-values.html
+++ b/java/doc/constant-values.html
@@ -99,6 +99,36 @@
 <TH ALIGN="left" COLSPAN="3">org.libjpegturbo.turbojpeg.<A HREF="org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_CMYK"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_CMYK">CS_CMYK</A></CODE></TD>
+<TD ALIGN="right"><CODE>3</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_GRAY"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_GRAY">CS_GRAY</A></CODE></TD>
+<TD ALIGN="right"><CODE>2</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_RGB"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_RGB">CS_RGB</A></CODE></TD>
+<TD ALIGN="right"><CODE>0</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_YCbCr"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr">CS_YCbCr</A></CODE></TD>
+<TD ALIGN="right"><CODE>1</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.CS_YCCK"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK">CS_YCCK</A></CODE></TD>
+<TD ALIGN="right"><CODE>4</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.FLAG_ACCURATEDCT"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#FLAG_ACCURATEDCT">FLAG_ACCURATEDCT</A></CODE></TD>
@@ -147,16 +177,22 @@
 <TD ALIGN="right"><CODE>128</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.NUMCS"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#NUMCS">NUMCS</A></CODE></TD>
+<TD ALIGN="right"><CODE>5</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.NUMPF"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#NUMPF">NUMPF</A></CODE></TD>
-<TD ALIGN="right"><CODE>11</CODE></TD>
+<TD ALIGN="right"><CODE>12</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.NUMSAMP"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#NUMSAMP">NUMSAMP</A></CODE></TD>
-<TD ALIGN="right"><CODE>5</CODE></TD>
+<TD ALIGN="right"><CODE>6</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.PF_ABGR"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
@@ -189,6 +225,12 @@
 <TD ALIGN="right"><CODE>3</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.PF_CMYK"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK">PF_CMYK</A></CODE></TD>
+<TD ALIGN="right"><CODE>11</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.PF_GRAY"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY">PF_GRAY</A></CODE></TD>
@@ -225,6 +267,12 @@
 <TD ALIGN="right"><CODE>5</CODE></TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<A NAME="org.libjpegturbo.turbojpeg.TJ.SAMP_411"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
+<CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
+<TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#SAMP_411">SAMP_411</A></CODE></TD>
+<TD ALIGN="right"><CODE>5</CODE></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <A NAME="org.libjpegturbo.turbojpeg.TJ.SAMP_420"><!-- --></A><TD ALIGN="right"><FONT SIZE="-1">
 <CODE>public&nbsp;static&nbsp;final&nbsp;int</CODE></FONT></TD>
 <TD ALIGN="left"><CODE><A HREF="org/libjpegturbo/turbojpeg/TJ.html#SAMP_420">SAMP_420</A></CODE></TD>
diff --git a/java/doc/deprecated-list.html b/java/doc/deprecated-list.html
index 37ca515..9f3b4fd 100644
--- a/java/doc/deprecated-list.html
+++ b/java/doc/deprecated-list.html
@@ -81,9 +81,39 @@
 </CENTER>
 <HR SIZE="4" NOSHADE>
 <B>Contents</B><UL>
+<LI><A HREF="#field">Deprecated Fields</A>
 <LI><A HREF="#method">Deprecated Methods</A>
 </UL>
 
+<A NAME="field"><!-- --></A>
+<TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
+<TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor">
+<TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2">
+<B>Deprecated Fields</B></FONT></TH>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCEMMX">org.libjpegturbo.turbojpeg.TJ.FLAG_FORCEMMX</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE">org.libjpegturbo.turbojpeg.TJ.FLAG_FORCESSE</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE2">org.libjpegturbo.turbojpeg.TJ.FLAG_FORCESSE2</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE3">org.libjpegturbo.turbojpeg.TJ.FLAG_FORCESSE3</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
+</TR>
+</TABLE>
+&nbsp;
+<P>
 <A NAME="method"><!-- --></A>
 <TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
 <TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor">
@@ -91,12 +121,28 @@
 <B>Deprecated Methods</B></FONT></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)">org.libjpegturbo.turbojpeg.TJ.bufSizeYUV(int, int, int)</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use <A HREF="org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> instead.</I>&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <TD><A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int)">org.libjpegturbo.turbojpeg.TJDecompressor.decompress(byte[], int, int, int, int, int)</A>
 <BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use
  <A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int, int, int)"><CODE>TJDecompressor.decompress(byte[], int, int, int, int, int, int, int)</CODE></A> instead.</I>&nbsp;</TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)">org.libjpegturbo.turbojpeg.TJDecompressor.decompressToYUV(byte[], int)</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use <A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I>&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD><A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">org.libjpegturbo.turbojpeg.TJDecompressor.decompressToYUV(int)</A>
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use <A HREF="org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(int, int, int, int)</CODE></A> instead.</I>&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
 <TD><A HREF="org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[], int, int, int, int)">org.libjpegturbo.turbojpeg.TJCompressor.setSourceImage(byte[], int, int, int, int)</A>
 <BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<I>Use
diff --git a/java/doc/index-all.html b/java/doc/index-all.html
index a534d43..81d9b45 100644
--- a/java/doc/index-all.html
+++ b/java/doc/index-all.html
@@ -82,10 +82,13 @@
 Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>Returns the maximum size of the buffer (in bytes) required to hold a JPEG
  image with the given width, height, and level of chrominance subsampling.
-<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><B>bufSizeYUV(int, int, int)</B></A> - 
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><B>bufSizeYUV(int, int, int, int)</B></A> - 
 Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>Returns the size of the buffer (in bytes) required to hold a YUV planar
  image with the given width, height, and level of chrominance subsampling.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><B>bufSizeYUV(int, int, int)</B></A> - 
+Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="./org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> instead.</I>
 </DL>
 <HR>
 <A NAME="_C_"><!-- --></A><H2>
@@ -116,11 +119,26 @@
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
 <DD>Compress the uncompressed source image stored in <code>srcImage</code>
  and return a buffer containing a JPEG image.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_CMYK"><B>CS_CMYK</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>CMYK colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_GRAY"><B>CS_GRAY</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>Grayscale colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_RGB"><B>CS_RGB</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>RGB colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr"><B>CS_YCbCr</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>YCbCr colorspace.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK"><B>CS_YCCK</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>YCCK colorspace.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJCustomFilter.html#customFilter(java.nio.ShortBuffer, java.awt.Rectangle, java.awt.Rectangle, int, int, org.libjpegturbo.turbojpeg.TJTransform)"><B>customFilter(ShortBuffer, Rectangle, Rectangle, int, int, TJTransform)</B></A> - 
 Method in interface org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCustomFilter.html" title="interface in org.libjpegturbo.turbojpeg">TJCustomFilter</A>
 <DD>A callback function that can be used to modify the DCT coefficients after
  they are losslessly transformed but before they are transcoded to a new
- JPEG file.
+ JPEG image.
 </DL>
 <HR>
 <A NAME="_D_"><!-- --></A><H2>
@@ -152,14 +170,21 @@
 <DD>Decompress the JPEG source image associated with this decompressor
  instance and return a <code>BufferedImage</code> instance containing the
  decompressed image.
-<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)"><B>decompressToYUV(byte[], int)</B></A> - 
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><B>decompressToYUV(byte[], int, int, int, int)</B></A> - 
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>Decompress the JPEG source image associated with this decompressor
  instance and output a YUV planar image to the given destination buffer.
-<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)"><B>decompressToYUV(int)</B></A> - 
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)"><B>decompressToYUV(byte[], int)</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I>
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><B>decompressToYUV(int, int, int, int)</B></A> - 
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>Decompress the JPEG source image associated with this decompressor
  instance and return a buffer containing a YUV planar image.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)"><B>decompressToYUV(int)</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>TJDecompressor.decompressToYUV(int, int, int, int)</CODE></A> instead.</I>
 </DL>
 <HR>
 <A NAME="_E_"><!-- --></A><H2>
@@ -214,20 +239,16 @@
  the underlying codec.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCEMMX"><B>FLAG_FORCEMMX</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use MMX code
- (if the underlying codec supports it.)
+<DD><B>Deprecated.</B>&nbsp;
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE"><B>FLAG_FORCESSE</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use SSE code
- (if the underlying codec supports it.)
+<DD><B>Deprecated.</B>&nbsp;
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE2"><B>FLAG_FORCESSE2</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use SSE2 code
- (if the underlying codec supports it.)
+<DD><B>Deprecated.</B>&nbsp;
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE3"><B>FLAG_FORCESSE3</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use SSE3 code
- (if the underlying codec supports it.)
+<DD><B>Deprecated.</B>&nbsp;
 </DL>
 <HR>
 <A NAME="_G_"><!-- --></A><H2>
@@ -237,6 +258,10 @@
 Static method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>For the given pixel format, returns the number of bytes that the blue
  component is offset from the start of the pixel.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#getColorspace()"><B>getColorspace()</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD>Returns the colorspace used in the JPEG image associated with this
+ decompressor instance.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html#getCompressedSize()"><B>getCompressedSize()</B></A> - 
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
 <DD>Returns the size of the image (in bytes) generated by the most recent
@@ -334,6 +359,9 @@
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBufSize"><B>jpegBufSize</B></A> - 
 Variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>&nbsp;
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegColorspace"><B>jpegColorspace</B></A> - 
+Variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
+<DD>&nbsp;
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight"><B>jpegHeight</B></A> - 
 Variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A>
 <DD>&nbsp;
@@ -348,6 +376,9 @@
 <A NAME="_N_"><!-- --></A><H2>
 <B>N</B></H2>
 <DL>
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#NUMCS"><B>NUMCS</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>The number of JPEG colorspaces
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJTransform.html#NUMOP"><B>NUMOP</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</A>
 <DD>The number of lossless transform operations
@@ -432,6 +463,9 @@
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#PF_BGRX"><B>PF_BGRX</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>BGRX pixel format.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK"><B>PF_CMYK</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>CMYK pixel format.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY"><B>PF_GRAY</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>Grayscale pixel format.
@@ -455,6 +489,9 @@
 <A NAME="_S_"><!-- --></A><H2>
 <B>S</B></H2>
 <DL>
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#SAMP_411"><B>SAMP_411</B></A> - 
+Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
+<DD>4:1:1 chrominance subsampling.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJ.html#SAMP_420"><B>SAMP_420</B></A> - 
 Static variable in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</A>
 <DD>4:2:0 chrominance subsampling.
@@ -484,10 +521,17 @@
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
 <DD><B>Deprecated.</B>&nbsp;<I>Use
  <A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImage(byte[], int, int, int, int, int, int)"><CODE>TJCompressor.setSourceImage(byte[], int, int, int, int, int, int)</CODE></A> instead.</I>
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImageYUV(byte[], int, int, int)"><B>setSourceImageYUV(byte[], int, int, int)</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
+<DD>Associate an uncompressed YUV planar source image with this compressor
+ instance.
 <DT><A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html#setSubsamp(int)"><B>setSubsamp(int)</B></A> - 
 Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
 <DD>Set the level of chrominance subsampling for subsequent compress/encode
  operations.
+<DT><A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html#setYUVPad(int)"><B>setYUVPad(int)</B></A> - 
+Method in class org.libjpegturbo.turbojpeg.<A HREF="./org/libjpegturbo/turbojpeg/TJCompressor.html" title="class in org.libjpegturbo.turbojpeg">TJCompressor</A>
+<DD>Set the plane padding for subsequent YUV encode operations.
 </DL>
 <HR>
 <A NAME="_T_"><!-- --></A><H2>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJ.html b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
index f905406..bad022e 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJ.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
@@ -115,6 +115,46 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_CMYK">CS_CMYK</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;CMYK colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_GRAY">CS_GRAY</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Grayscale colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_RGB">CS_RGB</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;RGB colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr">CS_YCbCr</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;YCbCr colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK">CS_YCCK</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;YCCK colorspace.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_ACCURATEDCT">FLAG_ACCURATEDCT</A></B></CODE>
 
 <BR>
@@ -154,8 +194,7 @@
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCEMMX">FLAG_FORCEMMX</A></B></CODE>
 
 <BR>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Turn off CPU auto-detection and force TurboJPEG to use MMX code
- (if the underlying codec supports it.)</TD>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;</TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
@@ -163,8 +202,7 @@
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE">FLAG_FORCESSE</A></B></CODE>
 
 <BR>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Turn off CPU auto-detection and force TurboJPEG to use SSE code
- (if the underlying codec supports it.)</TD>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;</TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
@@ -172,8 +210,7 @@
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE2">FLAG_FORCESSE2</A></B></CODE>
 
 <BR>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Turn off CPU auto-detection and force TurboJPEG to use SSE2 code
- (if the underlying codec supports it.)</TD>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;</TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
@@ -181,8 +218,15 @@
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_FORCESSE3">FLAG_FORCESSE3</A></B></CODE>
 
 <BR>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Turn off CPU auto-detection and force TurboJPEG to use SSE3 code
- (if the underlying codec supports it.)</TD>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#NUMCS">NUMCS</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;The number of JPEG colorspaces</TD>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
@@ -243,6 +287,14 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK">PF_CMYK</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;CMYK pixel format.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY">PF_GRAY</A></B></CODE>
 
 <BR>
@@ -291,6 +343,14 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_411">SAMP_411</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4:1:1 chrominance subsampling.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_420">SAMP_420</A></B></CODE>
 
 <BR>
@@ -373,6 +433,17 @@
            int&nbsp;subsamp)</CODE>
 
 <BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>bufSizeYUV(int, int, int, int)</CODE></A> instead.</I></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>static&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)">bufSizeYUV</A></B>(int&nbsp;width,
+           int&nbsp;pad,
+           int&nbsp;height,
+           int&nbsp;subsamp)</CODE>
+
+<BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Returns the size of the buffer (in bytes) required to hold a YUV planar
  image with the given width, height, and level of chrominance subsampling.</TD>
 </TR>
@@ -539,6 +610,25 @@
 </DL>
 <HR>
 
+<A NAME="SAMP_411"><!-- --></A><H3>
+SAMP_411</H3>
+<PRE>
+public static final int <B>SAMP_411</B></PRE>
+<DL>
+<DD>4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+ chrominance component for every 4x1 block of pixels in the source image.
+ JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+ same size as those compressed with 4:2:0 subsampling, and in the
+ aggregate, both subsampling methods produce approximately the same
+ perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+ horizontal features.  Note that 4:1:1 subsampling is not fully accelerated
+ in libjpeg-turbo.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.SAMP_411">Constant Field Values</A></DL>
+</DL>
+<HR>
+
 <A NAME="NUMPF"><!-- --></A><H3>
 NUMPF</H3>
 <PRE>
@@ -708,6 +798,131 @@
 </DL>
 <HR>
 
+<A NAME="PF_CMYK"><!-- --></A><H3>
+PF_CMYK</H3>
+<PRE>
+public static final int <B>PF_CMYK</B></PRE>
+<DL>
+<DD>CMYK pixel format.  Unlike RGB, which is an additive color model used
+ primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+ color model used primarily for printing.  In the CMYK color model, the
+ value of each color component typically corresponds to an amount of cyan,
+ magenta, yellow, or black ink that is applied to a white background.  In
+ order to convert between CMYK and RGB, it is necessary to use a color
+ management system (CMS.)  A CMS will attempt to map colors within the
+ printer's gamut to perceptually similar colors in the display's gamut and
+ vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+ be defined with a simple formula.  Thus, such a conversion is out of scope
+ for a codec library.  However, the TurboJPEG API allows for compressing
+ CMYK pixels into a YCCK JPEG image (see <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK"><CODE>CS_YCCK</CODE></A>) and
+ decompressing YCCK JPEG images into CMYK pixels.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.PF_CMYK">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="NUMCS"><!-- --></A><H3>
+NUMCS</H3>
+<PRE>
+public static final int <B>NUMCS</B></PRE>
+<DL>
+<DD>The number of JPEG colorspaces
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.NUMCS">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_RGB"><!-- --></A><H3>
+CS_RGB</H3>
+<PRE>
+public static final int <B>CS_RGB</B></PRE>
+<DL>
+<DD>RGB colorspace.  When compressing the JPEG image, the R, G, and B
+ components in the source image are reordered into image planes, but no
+ colorspace conversion or subsampling is performed.  RGB JPEG images can be
+ decompressed to any of the extended RGB pixel formats or grayscale, but
+ they cannot be decompressed to YUV images.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_RGB">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_YCbCr"><!-- --></A><H3>
+CS_YCbCr</H3>
+<PRE>
+public static final int <B>CS_YCbCr</B></PRE>
+<DL>
+<DD>YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+ mathematical transformation of RGB designed solely for storage and
+ transmission.  YCbCr images must be converted to RGB before they can
+ actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+ component represents the black & white portion of the original image, and
+ the Cb and Cr (chrominance) components represent the color portion of the
+ original image.  Originally, the analog equivalent of this transformation
+ allowed the same signal to drive both black & white and color televisions,
+ but JPEG images use YCbCr primarily because it allows the color data to be
+ optionally subsampled for the purposes of reducing bandwidth or disk
+ space.  YCbCr is the most common JPEG colorspace, and YCbCr JPEG images
+ can be compressed from and decompressed to any of the extended RGB pixel
+ formats or grayscale, or they can be decompressed to YUV planar images.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_YCbCr">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_GRAY"><!-- --></A><H3>
+CS_GRAY</H3>
+<PRE>
+public static final int <B>CS_GRAY</B></PRE>
+<DL>
+<DD>Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+ component), and any color data from the source image is discarded.
+ Grayscale JPEG images can be compressed from and decompressed to any of
+ the extended RGB pixel formats or grayscale, or they can be decompressed
+ to YUV planar images.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_GRAY">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_CMYK"><!-- --></A><H3>
+CS_CMYK</H3>
+<PRE>
+public static final int <B>CS_CMYK</B></PRE>
+<DL>
+<DD>CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+ components in the source image are reordered into image planes, but no
+ colorspace conversion or subsampling is performed.  CMYK JPEG images can
+ only be decompressed to CMYK pixels.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_CMYK">Constant Field Values</A></DL>
+</DL>
+<HR>
+
+<A NAME="CS_YCCK"><!-- --></A><H3>
+CS_YCCK</H3>
+<PRE>
+public static final int <B>CS_YCCK</B></PRE>
+<DL>
+<DD>YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+ rather a mathematical transformation of CMYK designed solely for storage
+ and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+ reversibly transformed into YCCK, and as with YCbCr, the chrominance
+ components in the YCCK pixels can be subsampled without incurring major
+ perceptual loss.  YCCK JPEG images can only be compressed from and
+ decompressed to CMYK pixels.
+<P>
+<DL>
+<DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.CS_YCCK">Constant Field Values</A></DL>
+</DL>
+<HR>
+
 <A NAME="FLAG_BOTTOMUP"><!-- --></A><H3>
 FLAG_BOTTOMUP</H3>
 <PRE>
@@ -724,12 +939,10 @@
 <A NAME="FLAG_FORCEMMX"><!-- --></A><H3>
 FLAG_FORCEMMX</H3>
 <PRE>
-public static final int <B>FLAG_FORCEMMX</B></PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public static final int <B>FLAG_FORCEMMX</B></PRE>
 <DL>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use MMX code
- (if the underlying codec supports it.)
-<P>
-<DL>
+<DD><B>Deprecated.</B>&nbsp;<DL>
 <DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.FLAG_FORCEMMX">Constant Field Values</A></DL>
 </DL>
 <HR>
@@ -737,12 +950,10 @@
 <A NAME="FLAG_FORCESSE"><!-- --></A><H3>
 FLAG_FORCESSE</H3>
 <PRE>
-public static final int <B>FLAG_FORCESSE</B></PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public static final int <B>FLAG_FORCESSE</B></PRE>
 <DL>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use SSE code
- (if the underlying codec supports it.)
-<P>
-<DL>
+<DD><B>Deprecated.</B>&nbsp;<DL>
 <DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.FLAG_FORCESSE">Constant Field Values</A></DL>
 </DL>
 <HR>
@@ -750,12 +961,10 @@
 <A NAME="FLAG_FORCESSE2"><!-- --></A><H3>
 FLAG_FORCESSE2</H3>
 <PRE>
-public static final int <B>FLAG_FORCESSE2</B></PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public static final int <B>FLAG_FORCESSE2</B></PRE>
 <DL>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use SSE2 code
- (if the underlying codec supports it.)
-<P>
-<DL>
+<DD><B>Deprecated.</B>&nbsp;<DL>
 <DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.FLAG_FORCESSE2">Constant Field Values</A></DL>
 </DL>
 <HR>
@@ -763,12 +972,10 @@
 <A NAME="FLAG_FORCESSE3"><!-- --></A><H3>
 FLAG_FORCESSE3</H3>
 <PRE>
-public static final int <B>FLAG_FORCESSE3</B></PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public static final int <B>FLAG_FORCESSE3</B></PRE>
 <DL>
-<DD>Turn off CPU auto-detection and force TurboJPEG to use SSE3 code
- (if the underlying codec supports it.)
-<P>
-<DL>
+<DD><B>Deprecated.</B>&nbsp;<DL>
 <DT><B>See Also:</B><DD><A HREF="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.FLAG_FORCESSE3">Constant Field Values</A></DL>
 </DL>
 <HR>
@@ -991,10 +1198,11 @@
 </DL>
 <HR>
 
-<A NAME="bufSizeYUV(int, int, int)"><!-- --></A><H3>
+<A NAME="bufSizeYUV(int, int, int, int)"><!-- --></A><H3>
 bufSizeYUV</H3>
 <PRE>
 public static int <B>bufSizeYUV</B>(int&nbsp;width,
+                             int&nbsp;pad,
                              int&nbsp;height,
                              int&nbsp;subsamp)
                       throws java.lang.Exception</PRE>
@@ -1003,7 +1211,9 @@
  image with the given width, height, and level of chrominance subsampling.
 <P>
 <DD><DL>
-<DT><B>Parameters:</B><DD><CODE>width</CODE> - the width (in pixels) of the YUV image<DD><CODE>height</CODE> - the height (in pixels) of the YUV image<DD><CODE>subsamp</CODE> - the level of chrominance subsampling used in the YUV
+<DT><B>Parameters:</B><DD><CODE>width</CODE> - the width (in pixels) of the YUV image<DD><CODE>pad</CODE> - the width of each line in each plane of the image is padded to
+        the nearest multiple of this number of bytes (must be a power of
+        2.)<DD><CODE>height</CODE> - the height (in pixels) of the YUV image<DD><CODE>subsamp</CODE> - the level of chrominance subsampling used in the YUV
  image (one of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.SAMP_*</CODE></A>)
 <DT><B>Returns:</B><DD>the size of the buffer (in bytes) required to hold a YUV planar
  image with the given width, height, and level of chrominance subsampling
@@ -1013,6 +1223,25 @@
 </DL>
 <HR>
 
+<A NAME="bufSizeYUV(int, int, int)"><!-- --></A><H3>
+bufSizeYUV</H3>
+<PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public static int <B>bufSizeYUV</B>(int&nbsp;width,
+                                        int&nbsp;height,
+                                        int&nbsp;subsamp)
+                      throws java.lang.Exception</PRE>
+<DL>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>bufSizeYUV(int, int, int, int)</CODE></A> instead.</I>
+<P>
+<DD><DL>
+
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
 <A NAME="getScalingFactors()"><!-- --></A><H3>
 getScalingFactors</H3>
 <PRE>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
index 7fa3d0f..35114c7 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
@@ -298,12 +298,32 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>&nbsp;void</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSourceImageYUV(byte[], int, int, int)">setSourceImageYUV</A></B>(byte[]&nbsp;srcImage,
+                  int&nbsp;width,
+                  int&nbsp;pad,
+                  int&nbsp;height)</CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Associate an uncompressed YUV planar source image with this compressor
+ instance.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;void</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSubsamp(int)">setSubsamp</A></B>(int&nbsp;newSubsamp)</CODE>
 
 <BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Set the level of chrominance subsampling for subsequent compress/encode
  operations.</TD>
 </TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;void</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setYUVPad(int)">setYUVPad</A></B>(int&nbsp;pad)</CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Set the plane padding for subsequent YUV encode operations.</TD>
+</TR>
 </TABLE>
 &nbsp;<A NAME="methods_inherited_from_class_java.lang.Object"><!-- --></A>
 <TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
@@ -412,8 +432,8 @@
 <DD>Associate an uncompressed source image with this compressor instance.
 <P>
 <DD><DL>
-<DT><B>Parameters:</B><DD><CODE>srcImage</CODE> - image buffer containing RGB or grayscale pixels to be
- compressed<DD><CODE>x</CODE> - x offset (in pixels) of the region from which the JPEG image
+<DT><B>Parameters:</B><DD><CODE>srcImage</CODE> - image buffer containing RGB, grayscale, or CMYK pixels to
+ be compressed<DD><CODE>x</CODE> - x offset (in pixels) of the region from which the JPEG image
  should be compressed, relative to the start of <code>srcImage</code>.<DD><CODE>y</CODE> - y offset (in pixels) of the region from which the JPEG image
  should be compressed, relative to the start of <code>srcImage</code>.<DD><CODE>width</CODE> - width (in pixels) of the region in the source image from
  which the JPEG image should be compressed.<DD><CODE>pitch</CODE> - bytes per line of the source image.  Normally, this should be
@@ -435,11 +455,12 @@
 <A NAME="setSourceImage(byte[], int, int, int, int)"><!-- --></A><H3>
 setSourceImage</H3>
 <PRE>
-public void <B>setSourceImage</B>(byte[]&nbsp;srcImage,
-                           int&nbsp;width,
-                           int&nbsp;pitch,
-                           int&nbsp;height,
-                           int&nbsp;pixelFormat)
+<FONT SIZE="-1">@Deprecated
+</FONT>public void <B>setSourceImage</B>(byte[]&nbsp;srcImage,
+                                      int&nbsp;width,
+                                      int&nbsp;pitch,
+                                      int&nbsp;height,
+                                      int&nbsp;pixelFormat)
                     throws java.lang.Exception</PRE>
 <DL>
 <DD><B>Deprecated.</B>&nbsp;<I>Use
@@ -453,6 +474,38 @@
 </DL>
 <HR>
 
+<A NAME="setSourceImageYUV(byte[], int, int, int)"><!-- --></A><H3>
+setSourceImageYUV</H3>
+<PRE>
+public void <B>setSourceImageYUV</B>(byte[]&nbsp;srcImage,
+                              int&nbsp;width,
+                              int&nbsp;pad,
+                              int&nbsp;height)
+                       throws java.lang.Exception</PRE>
+<DL>
+<DD>Associate an uncompressed YUV planar source image with this compressor
+ instance.
+<P>
+<DD><DL>
+<DT><B>Parameters:</B><DD><CODE>srcImage</CODE> - image buffer containing a YUV planar image to be
+ compressed.  The Y, U (Cb), and V (Cr) image planes should be stored
+ sequentially in the buffer, and the size of each plane is determined by
+ the specified width, height, and padding, as well as the level of
+ chrominance subsampling (specified using <A HREF="../../../org/libjpegturbo/turbojpeg/TJCompressor.html#setSubsamp(int)"><CODE>setSubsamp(int)</CODE></A>.)  If the
+ chrominance components are subsampled along the horizontal dimension, then
+ the width of the luminance plane should be padded to the nearest multiple
+ of 2 (same goes for the height of the luminance plane, if the chrominance
+ components are subsampled along the vertical dimension.)  This is
+ irrespective of any additional padding specified in the <code>pad</code>
+ parameter.<DD><CODE>width</CODE> - width (in pixels) of the source image<DD><CODE>pad</CODE> - the line padding used in the source image.  For instance, if
+ each line in each plane of the YUV image is padded to the nearest multiple
+ of 4 bytes, then <code>pad</code> should be set to 4.<DD><CODE>height</CODE> - height (in pixels) of the source image
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
 <A NAME="setSubsamp(int)"><!-- --></A><H3>
 setSubsamp</H3>
 <PRE>
@@ -460,7 +513,17 @@
                 throws java.lang.Exception</PRE>
 <DL>
 <DD>Set the level of chrominance subsampling for subsequent compress/encode
- operations.
+ operations.  When pixels are converted from RGB to YCbCr (see
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCbCr"><CODE>TJ.CS_YCbCr</CODE></A>) or from CMYK to YCCK (see <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#CS_YCCK"><CODE>TJ.CS_YCCK</CODE></A>) as part
+ of the JPEG compression process, some of the Cb and Cr (chrominance)
+ components can be discarded or averaged together to produce a smaller
+ image with little perceptible loss of image clarity (the human eye is more
+ sensitive to small changes in brightness than to small changes in color.)
+ This is called "chrominance subsampling".
+ <p>
+ NOTE: When compressing a YUV planar image into a JPEG image, this method
+ also specifies the level of chrominance subsampling used in the source
+ image.
 <P>
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>newSubsamp</CODE> - the new level of chrominance subsampling (one of
@@ -570,6 +633,25 @@
 </DL>
 <HR>
 
+<A NAME="setYUVPad(int)"><!-- --></A><H3>
+setYUVPad</H3>
+<PRE>
+public void <B>setYUVPad</B>(int&nbsp;pad)
+               throws java.lang.Exception</PRE>
+<DL>
+<DD>Set the plane padding for subsequent YUV encode operations.
+<P>
+<DD><DL>
+<DT><B>Parameters:</B><DD><CODE>pad</CODE> - the width of each line in each plane of the YUV image will be
+        padded to the nearest multiple of this number of bytes (must be a
+        power of 2.)  The default padding is 4 bytes, which generates
+        images suitable for direct video display.
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
 <A NAME="encodeYUV(byte[], int)"><!-- --></A><H3>
 encodeYUV</H3>
 <PRE>
@@ -579,17 +661,16 @@
 <DL>
 <DD>Encode the uncompressed source image associated with this compressor
  instance and output a YUV planar image to the given destination buffer.
- This method uses the accelerated color conversion routines in
- TurboJPEG's underlying codec to produce a planar YUV image that is
- suitable for direct video display.  Specifically, if the chrominance
- components are subsampled along the horizontal dimension, then the width
- of the luminance plane is padded to the nearest multiple of 2 in the
- output image (same goes for the height of the luminance plane, if the
+ This method uses the accelerated color conversion routines in TurboJPEG's
+ underlying codec but does not execute any of the other steps in the JPEG
+ compression process.  The Y, U (Cb), and V (Cr) image planes are stored
+ sequentially into the destination buffer, and the size of each plane is
+ determined by the width and height of the source image, as well as the
+ specified padding and level of chrominance subsampling.  If the
+ chrominance components are subsampled along the horizontal dimension, then
+ the width of the luminance plane is padded to the nearest multiple of 2 in
+ the output image (same goes for the height of the luminance plane, if the
  chrominance components are subsampled along the vertical dimension.)
- Also, each line of each plane in the output image is padded to 4 bytes.
- Although this will work with any subsampling option, it is really only
- useful in combination with <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_420"><CODE>TJ.SAMP_420</CODE></A>, which produces an image
- compatible with the I420 (AKA "YUV420P") format.
  <p>
  NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
  convention of the digital video community, the TurboJPEG API uses "YUV" to
@@ -597,7 +678,7 @@
 <P>
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>dstBuf</CODE> - buffer that will receive the YUV planar image.  Use
- <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int)</CODE></A> to determine the appropriate size for this buffer
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> to determine the appropriate size for this buffer
  based on the image width, height, and level of chrominance subsampling.<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
@@ -639,7 +720,7 @@
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>srcImage</CODE> - a <code>BufferedImage</code> instance containing RGB or
  grayscale pixels to be encoded<DD><CODE>dstBuf</CODE> - buffer that will receive the YUV planar image.  Use
- <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int)</CODE></A> to determine the appropriate size for this buffer
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> to determine the appropriate size for this buffer
  based on the image width, height, and level of chrominance subsampling.<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
index 707b58d..3291c71 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
@@ -122,7 +122,7 @@
 <BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;A callback function that can be used to modify the DCT coefficients after
  they are losslessly transformed but before they are transcoded to a new
- JPEG file.</TD>
+ JPEG image.</TD>
 </TR>
 </TABLE>
 &nbsp;
@@ -151,7 +151,7 @@
 <DL>
 <DD>A callback function that can be used to modify the DCT coefficients after
  they are losslessly transformed but before they are transcoded to a new
- JPEG file.  This allows for custom filters or other transformations to be
+ JPEG image.  This allows for custom filters or other transformations to be
  applied in the frequency domain.
 <P>
 <DD><DL>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
index a437c16..2dc3cc6 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
@@ -142,6 +142,14 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>protected &nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegColorspace">jpegColorspace</A></B></CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>protected &nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight">jpegHeight</A></B></CODE>
 
 <BR>
@@ -303,6 +311,19 @@
                 int&nbsp;flags)</CODE>
 
 <BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;void</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)">decompressToYUV</A></B>(byte[]&nbsp;dstBuf,
+                int&nbsp;desiredWidth,
+                int&nbsp;pad,
+                int&nbsp;desiredHeight,
+                int&nbsp;flags)</CODE>
+
+<BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Decompress the JPEG source image associated with this decompressor
  instance and output a YUV planar image to the given destination buffer.</TD>
 </TR>
@@ -312,6 +333,17 @@
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">decompressToYUV</A></B>(int&nbsp;flags)</CODE>
 
 <BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>decompressToYUV(int, int, int, int)</CODE></A> instead.</I></TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;byte[]</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)">decompressToYUV</A></B>(int&nbsp;desiredWidth,
+                int&nbsp;pad,
+                int&nbsp;desiredHeight,
+                int&nbsp;flags)</CODE>
+
+<BR>
 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Decompress the JPEG source image associated with this decompressor
  instance and return a buffer containing a YUV planar image.</TD>
 </TR>
@@ -326,6 +358,15 @@
 <TR BGCOLOR="white" CLASS="TableRowColor">
 <TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
 <CODE>&nbsp;int</CODE></FONT></TD>
+<TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getColorspace()">getColorspace</A></B>()</CODE>
+
+<BR>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Returns the colorspace used in the JPEG image associated with this
+ decompressor instance.</TD>
+</TR>
+<TR BGCOLOR="white" CLASS="TableRowColor">
+<TD ALIGN="right" VALIGN="top" WIDTH="1%"><FONT SIZE="-1">
+<CODE>&nbsp;int</CODE></FONT></TD>
 <TD><CODE><B><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getHeight()">getHeight</A></B>()</CODE>
 
 <BR>
@@ -480,6 +521,16 @@
 <DL>
 </DL>
 </DL>
+<HR>
+
+<A NAME="jpegColorspace"><!-- --></A><H3>
+jpegColorspace</H3>
+<PRE>
+protected int <B>jpegColorspace</B></PRE>
+<DL>
+<DL>
+</DL>
+</DL>
 
 <!-- ========= CONSTRUCTOR DETAIL ======== -->
 
@@ -614,7 +665,7 @@
                throws java.lang.Exception</PRE>
 <DL>
 <DD>Returns the level of chrominance subsampling used in the JPEG image
- associated with this decompressor instance.
+ associated with this decompressor instance.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.SAMP_*</CODE></A>.
 <P>
 <DD><DL>
 
@@ -626,6 +677,25 @@
 </DL>
 <HR>
 
+<A NAME="getColorspace()"><!-- --></A><H3>
+getColorspace</H3>
+<PRE>
+public int <B>getColorspace</B>()
+                  throws java.lang.Exception</PRE>
+<DL>
+<DD>Returns the colorspace used in the JPEG image associated with this
+ decompressor instance.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.CS_*</CODE></A>.
+<P>
+<DD><DL>
+
+<DT><B>Returns:</B><DD>the colorspace used in the JPEG image associated with this
+ decompressor instance
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
 <A NAME="getJPEGBuf()"><!-- --></A><H3>
 getJPEGBuf</H3>
 <PRE>
@@ -777,12 +847,13 @@
 <A NAME="decompress(byte[], int, int, int, int, int)"><!-- --></A><H3>
 decompress</H3>
 <PRE>
-public void <B>decompress</B>(byte[]&nbsp;dstBuf,
-                       int&nbsp;desiredWidth,
-                       int&nbsp;pitch,
-                       int&nbsp;desiredHeight,
-                       int&nbsp;pixelFormat,
-                       int&nbsp;flags)
+<FONT SIZE="-1">@Deprecated
+</FONT>public void <B>decompress</B>(byte[]&nbsp;dstBuf,
+                                  int&nbsp;desiredWidth,
+                                  int&nbsp;pitch,
+                                  int&nbsp;desiredHeight,
+                                  int&nbsp;pixelFormat,
+                                  int&nbsp;flags)
                 throws java.lang.Exception</PRE>
 <DL>
 <DD><B>Deprecated.</B>&nbsp;<I>Use
@@ -825,10 +896,13 @@
 </DL>
 <HR>
 
-<A NAME="decompressToYUV(byte[], int)"><!-- --></A><H3>
+<A NAME="decompressToYUV(byte[], int, int, int, int)"><!-- --></A><H3>
 decompressToYUV</H3>
 <PRE>
 public void <B>decompressToYUV</B>(byte[]&nbsp;dstBuf,
+                            int&nbsp;desiredWidth,
+                            int&nbsp;pad,
+                            int&nbsp;desiredHeight,
                             int&nbsp;flags)
                      throws java.lang.Exception</PRE>
 <DL>
@@ -848,8 +922,64 @@
 <P>
 <DD><DL>
 <DT><B>Parameters:</B><DD><CODE>dstBuf</CODE> - buffer that will receive the YUV planar image.  Use
- <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int)</CODE></A> to determine the appropriate size for this buffer
- based on the image width, height, and level of chrominance subsampling.<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html#bufSizeYUV(int, int, int, int)"><CODE>TJ.bufSizeYUV(int, int, int, int)</CODE></A> to determine the appropriate size for this buffer
+ based on the image width, height, and level of chrominance subsampling.<DD><CODE>desiredWidth</CODE> - desired width (in pixels) of the YUV image.  If the
+ desired image dimensions are different than the dimensions of the JPEG
+ image being decompressed, then TurboJPEG will use scaling in the JPEG
+ decompressor to generate the largest possible image that will fit within
+ the desired dimensions.  Setting this to 0 is the same as setting it to
+ the width of the JPEG image (in other words, the width will not be
+ considered when determining the scaled image size.)<DD><CODE>pad</CODE> - the width of each line in each plane of the YUV image will be
+ padded to the nearest multiple of this number of bytes (must be a power of
+ 2.)<DD><CODE>desiredHeight</CODE> - desired height (in pixels) of the YUV image.  If the
+ desired image dimensions are different than the dimensions of the JPEG
+ image being decompressed, then TurboJPEG will use scaling in the JPEG
+ decompressor to generate the largest possible image that will fit within
+ the desired dimensions.  Setting this to 0 is the same as setting it to
+ the height of the JPEG image (in other words, the height will not be
+ considered when determining the scaled image size.)<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
+<A NAME="decompressToYUV(byte[], int)"><!-- --></A><H3>
+decompressToYUV</H3>
+<PRE>
+<FONT SIZE="-1">@Deprecated
+</FONT>public void <B>decompressToYUV</B>(byte[]&nbsp;dstBuf,
+                                       int&nbsp;flags)
+                     throws java.lang.Exception</PRE>
+<DL>
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A>
+ instead.</I>
+<P>
+<DD><DL>
+
+<DT><B>Throws:</B>
+<DD><CODE>java.lang.Exception</CODE></DL>
+</DD>
+</DL>
+<HR>
+
+<A NAME="decompressToYUV(int, int, int, int)"><!-- --></A><H3>
+decompressToYUV</H3>
+<PRE>
+public byte[] <B>decompressToYUV</B>(int&nbsp;desiredWidth,
+                              int&nbsp;pad,
+                              int&nbsp;desiredHeight,
+                              int&nbsp;flags)
+                       throws java.lang.Exception</PRE>
+<DL>
+<DD>Decompress the JPEG source image associated with this decompressor
+ instance and return a buffer containing a YUV planar image.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for more detail.
+<P>
+<DD><DL>
+<DT><B>Parameters:</B><DD><CODE>desiredWidth</CODE> - see
+ <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for description<DD><CODE>pad</CODE> - see <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for
+ description<DD><CODE>desiredHeight</CODE> - see <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)"><CODE>decompressToYUV(byte[], int, int, int, int)</CODE></A> for description<DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
+<DT><B>Returns:</B><DD>a buffer containing a YUV planar image
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
 </DD>
@@ -859,15 +989,14 @@
 <A NAME="decompressToYUV(int)"><!-- --></A><H3>
 decompressToYUV</H3>
 <PRE>
-public byte[] <B>decompressToYUV</B>(int&nbsp;flags)
+<FONT SIZE="-1">@Deprecated
+</FONT>public byte[] <B>decompressToYUV</B>(int&nbsp;flags)
                        throws java.lang.Exception</PRE>
 <DL>
-<DD>Decompress the JPEG source image associated with this decompressor
- instance and return a buffer containing a YUV planar image.  See <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)"><CODE>decompressToYUV(byte[], int)</CODE></A> for more detail.
+<DD><B>Deprecated.</B>&nbsp;<I>Use <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)"><CODE>decompressToYUV(int, int, int, int)</CODE></A> instead.</I>
 <P>
 <DD><DL>
-<DT><B>Parameters:</B><DD><CODE>flags</CODE> - the bitwise OR of one or more of <A HREF="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><CODE>TJ.FLAG_*</CODE></A>
-<DT><B>Returns:</B><DD>a buffer containing a YUV planar image
+
 <DT><B>Throws:</B>
 <DD><CODE>java.lang.Exception</CODE></DL>
 </DD>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
index 1e76ac8..0811b51 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
@@ -120,7 +120,7 @@
 <TH ALIGN="left"><B>Fields inherited from class org.libjpegturbo.turbojpeg.<A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A></B></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
-<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#handle">handle</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBuf">jpegBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBufSize">jpegBufSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight">jpegHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegSubsamp">jpegSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegWidth">jpegWidth</A></CODE></TD>
+<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#handle">handle</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBuf">jpegBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegBufSize">jpegBufSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegColorspace">jpegColorspace</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegHeight">jpegHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegSubsamp">jpegSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#jpegWidth">jpegWidth</A></CODE></TD>
 </TR>
 </TABLE>
 &nbsp;
@@ -203,7 +203,7 @@
 <TH ALIGN="left"><B>Methods inherited from class org.libjpegturbo.turbojpeg.<A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg">TJDecompressor</A></B></TH>
 </TR>
 <TR BGCOLOR="white" CLASS="TableRowColor">
-<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#close()">close</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(java.awt.image.BufferedImage, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#finalize()">finalize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getHeight()">getHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGBuf()">getJPEGBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGSize()">getJPEGSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledHeight(int, int)">getScaledHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledWidth(int, int)">getScaledWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getSubsamp()">getSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getWidth()">getWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#setJPEGImage(byte[], int)">setJPEGImage</A></CODE></TD>
+<TD><CODE><A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#close()">close</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(java.awt.image.BufferedImage, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(byte[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int[], int, int, int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompress(int, int, int, int, int)">decompress</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(byte[], int, int, int, int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#decompressToYUV(int, int, int, int)">decompressToYUV</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#finalize()">finalize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getColorspace()">getColorspace</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getHeight()">getHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGBuf()">getJPEGBuf</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getJPEGSize()">getJPEGSize</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledHeight(int, int)">getScaledHeight</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getScaledWidth(int, int)">getScaledWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getSubsamp()">getSubsamp</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#getWidth()">getWidth</A>, <A HREF="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html#setJPEGImage(byte[], int)">setJPEGImage</A></CODE></TD>
 </TR>
 </TABLE>
 &nbsp;<A NAME="methods_inherited_from_class_java.lang.Object"><!-- --></A>
diff --git a/java/org/libjpegturbo/turbojpeg/TJ.java b/java/org/libjpegturbo/turbojpeg/TJ.java
index 6c6a95d..ac4a4dd 100644
--- a/java/org/libjpegturbo/turbojpeg/TJ.java
+++ b/java/org/libjpegturbo/turbojpeg/TJ.java
@@ -37,7 +37,7 @@
   /**
    * The number of chrominance subsampling options
    */
-  public static final int NUMSAMP   = 5;
+  public static final int NUMSAMP   = 6;
   /**
    * 4:4:4 chrominance subsampling (no chrominance subsampling).  The JPEG
    * or YUV image will contain one chrominance component for every pixel in the
@@ -64,6 +64,17 @@
    * Note that 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
    */
   public static final int SAMP_440  = 4;
+  /**
+   * 4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 4x1 block of pixels in the source image.
+   * JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+   * horizontal features.  Note that 4:1:1 subsampling is not fully accelerated
+   * in libjpeg-turbo.
+   */
+  public static final int SAMP_411  = 5;
 
 
   /**
@@ -82,7 +93,7 @@
   }
 
   private static final int[] mcuWidth = {
-    8, 16, 16, 8, 8
+    8, 16, 16, 8, 8, 32
   };
 
 
@@ -103,14 +114,14 @@
   }
 
   private static final int[] mcuHeight = {
-    8, 8, 16, 8, 16
+    8, 8, 16, 8, 16, 8
   };
 
 
   /**
    * The number of pixel formats
    */
-  public static final int NUMPF   = 11;
+  public static final int NUMPF   = 12;
   /**
    * RGB pixel format.  The red, green, and blue components in the image are
    * stored in 3-byte pixels in the order R, G, B from lowest to highest byte
@@ -180,6 +191,22 @@
    * interpreted as an opaque alpha channel.
    */
   public static final int PF_ARGB = 10;
+  /**
+   * CMYK pixel format.  Unlike RGB, which is an additive color model used
+   * primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+   * color model used primarily for printing.  In the CMYK color model, the
+   * value of each color component typically corresponds to an amount of cyan,
+   * magenta, yellow, or black ink that is applied to a white background.  In
+   * order to convert between CMYK and RGB, it is necessary to use a color
+   * management system (CMS.)  A CMS will attempt to map colors within the
+   * printer's gamut to perceptually similar colors in the display's gamut and
+   * vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+   * be defined with a simple formula.  Thus, such a conversion is out of scope
+   * for a codec library.  However, the TurboJPEG API allows for compressing
+   * CMYK pixels into a YCCK JPEG image (see {@link #CS_YCCK}) and
+   * decompressing YCCK JPEG images into CMYK pixels.
+   */
+  public static final int PF_CMYK = 11;
 
 
   /**
@@ -196,7 +223,7 @@
   }
 
   private static final int[] pixelSize = {
-    3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4
+    3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4
   };
 
 
@@ -218,7 +245,7 @@
   }
 
   private static final int[] redOffset = {
-    0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1
+    0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1, -1
   };
 
 
@@ -240,7 +267,7 @@
   }
 
   private static final int[] greenOffset = {
-    1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2
+    1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, -1
   };
 
 
@@ -262,35 +289,80 @@
   }
 
   private static final int[] blueOffset = {
-    2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3
+    2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3, -1
   };
 
 
   /**
+   * The number of JPEG colorspaces
+   */
+  public static final int NUMCS = 5;
+  /**
+   * RGB colorspace.  When compressing the JPEG image, the R, G, and B
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  RGB JPEG images can be
+   * decompressed to any of the extended RGB pixel formats or grayscale, but
+   * they cannot be decompressed to YUV images.
+   */
+  public static final int CS_RGB = 0;
+  /**
+   * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+   * mathematical transformation of RGB designed solely for storage and
+   * transmission.  YCbCr images must be converted to RGB before they can
+   * actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+   * component represents the black & white portion of the original image, and
+   * the Cb and Cr (chrominance) components represent the color portion of the
+   * original image.  Originally, the analog equivalent of this transformation
+   * allowed the same signal to drive both black & white and color televisions,
+   * but JPEG images use YCbCr primarily because it allows the color data to be
+   * optionally subsampled for the purposes of reducing bandwidth or disk
+   * space.  YCbCr is the most common JPEG colorspace, and YCbCr JPEG images
+   * can be compressed from and decompressed to any of the extended RGB pixel
+   * formats or grayscale, or they can be decompressed to YUV planar images.
+   */
+  public static final int CS_YCbCr = 1;
+  /**
+   * Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+   * component), and any color data from the source image is discarded.
+   * Grayscale JPEG images can be compressed from and decompressed to any of
+   * the extended RGB pixel formats or grayscale, or they can be decompressed
+   * to YUV planar images.
+   */
+  public static final int CS_GRAY = 2;
+  /**
+   * CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  CMYK JPEG images can
+   * only be decompressed to CMYK pixels.
+   */
+  public static final int CS_CMYK = 3;
+  /**
+   * YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+   * rather a mathematical transformation of CMYK designed solely for storage
+   * and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+   * reversibly transformed into YCCK, and as with YCbCr, the chrominance
+   * components in the YCCK pixels can be subsampled without incurring major
+   * perceptual loss.  YCCK JPEG images can only be compressed from and
+   * decompressed to CMYK pixels.
+   */
+  public static final int CS_YCCK = 4;
+
+
+  /**
    * The uncompressed source/destination image is stored in bottom-up (Windows,
    * OpenGL) order, not top-down (X11) order.
    */
   public static final int FLAG_BOTTOMUP     = 2;
-  /**
-   * Turn off CPU auto-detection and force TurboJPEG to use MMX code
-   * (if the underlying codec supports it.)
-   */
+
+  @Deprecated
   public static final int FLAG_FORCEMMX     = 8;
-  /**
-   * Turn off CPU auto-detection and force TurboJPEG to use SSE code
-   * (if the underlying codec supports it.)
-   */
+  @Deprecated
   public static final int FLAG_FORCESSE     = 16;
-  /**
-   * Turn off CPU auto-detection and force TurboJPEG to use SSE2 code
-   * (if the underlying codec supports it.)
-   */
+  @Deprecated
   public static final int FLAG_FORCESSE2    = 32;
-  /**
-   * Turn off CPU auto-detection and force TurboJPEG to use SSE3 code
-   * (if the underlying codec supports it.)
-   */
+  @Deprecated
   public static final int FLAG_FORCESSE3    = 128;
+
   /**
    * When decompressing an image that was compressed using chrominance
    * subsampling, use the fastest chrominance upsampling algorithm available in
@@ -343,6 +415,10 @@
    *
    * @param width the width (in pixels) of the YUV image
    *
+   * @param pad the width of each line in each plane of the image is padded to
+   *        the nearest multiple of this number of bytes (must be a power of
+   *        2.)
+   *
    * @param height the height (in pixels) of the YUV image
    *
    * @param subsamp the level of chrominance subsampling used in the YUV
@@ -351,6 +427,14 @@
    * @return the size of the buffer (in bytes) required to hold a YUV planar
    * image with the given width, height, and level of chrominance subsampling
    */
+  public static native int bufSizeYUV(int width, int pad, int height,
+                                      int subsamp)
+    throws Exception;
+
+  /**
+   * @deprecated Use {@link #bufSizeYUV(int, int, int, int)} instead.
+   */
+  @Deprecated
   public static native int bufSizeYUV(int width, int height, int subsamp)
     throws Exception;
 
diff --git a/java/org/libjpegturbo/turbojpeg/TJCompressor.java b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
index 52ae613..63a7fa5 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
@@ -95,8 +95,8 @@
   /**
    * Associate an uncompressed source image with this compressor instance.
    *
-   * @param srcImage image buffer containing RGB or grayscale pixels to be
-   * compressed
+   * @param srcImage image buffer containing RGB, grayscale, or CMYK pixels to
+   * be compressed
    *
    * @param x x offset (in pixels) of the region from which the JPEG image
    * should be compressed, relative to the start of <code>srcImage</code>.
@@ -139,22 +139,69 @@
     srcPixelFormat = pixelFormat;
     srcX = x;
     srcY = y;
+    srcIsYUV = false;
   }
 
   /**
    * @deprecated Use
    * {@link #setSourceImage(byte[], int, int, int, int, int, int)} instead.
    */
+  @Deprecated
   public void setSourceImage(byte[] srcImage, int width, int pitch,
                              int height, int pixelFormat) throws Exception {
     setSourceImage(srcImage, 0, 0, width, pitch, height, pixelFormat);
     srcX = srcY = -1;
   }
 
+  /**
+   * Associate an uncompressed YUV planar source image with this compressor
+   * instance.
+   *
+   * @param srcImage image buffer containing a YUV planar image to be
+   * compressed.  The Y, U (Cb), and V (Cr) image planes should be stored
+   * sequentially in the buffer, and the size of each plane is determined by
+   * the specified width, height, and padding, as well as the level of
+   * chrominance subsampling (specified using {@link #setSubsamp}.)  If the
+   * chrominance components are subsampled along the horizontal dimension, then
+   * the width of the luminance plane should be padded to the nearest multiple
+   * of 2 (same goes for the height of the luminance plane, if the chrominance
+   * components are subsampled along the vertical dimension.)  This is
+   * irrespective of any additional padding specified in the <code>pad</code>
+   * parameter.
+   *
+   * @param width width (in pixels) of the source image
+   *
+   * @param pad the line padding used in the source image.  For instance, if
+   * each line in each plane of the YUV image is padded to the nearest multiple
+   * of 4 bytes, then <code>pad</code> should be set to 4.
+   *
+   * @param height height (in pixels) of the source image
+   */
+  public void setSourceImageYUV(byte[] srcImage, int width, int pad,
+                                int height) throws Exception {
+    if (handle == 0) init();
+    if (srcImage == null || width < 1 || pad < 1 || height < 1)
+      throw new Exception("Invalid argument in setSourceImageYUV()");
+    srcBuf = srcImage;
+    srcWidth = width;
+    srcYUVPad = pad;
+    srcHeight = height;
+    srcIsYUV = true;
+  }
 
   /**
    * Set the level of chrominance subsampling for subsequent compress/encode
-   * operations.
+   * operations.  When pixels are converted from RGB to YCbCr (see
+   * {@link TJ#CS_YCbCr}) or from CMYK to YCCK (see {@link TJ#CS_YCCK}) as part
+   * of the JPEG compression process, some of the Cb and Cr (chrominance)
+   * components can be discarded or averaged together to produce a smaller
+   * image with little perceptible loss of image clarity (the human eye is more
+   * sensitive to small changes in brightness than to small changes in color.)
+   * This is called "chrominance subsampling".
+   * <p>
+   * NOTE: When compressing a YUV planar image into a JPEG image, this method
+   * also specifies the level of chrominance subsampling used in the source
+   * image.
    *
    * @param newSubsamp the new level of chrominance subsampling (one of
    * {@link TJ TJ.SAMP_*})
@@ -196,14 +243,19 @@
       throw new Exception("JPEG Quality not set");
     if (subsamp < 0)
       throw new Exception("Subsampling level not set");
-    if (srcX >= 0 && srcY >= 0)
-      compressedSize = compress(srcBuf, srcX, srcY, srcWidth, srcPitch,
-                                srcHeight, srcPixelFormat, dstBuf, subsamp,
-                                jpegQuality, flags);
-    else
-      compressedSize = compress(srcBuf, srcWidth, srcPitch, srcHeight,
-                                srcPixelFormat, dstBuf, subsamp, jpegQuality,
-                                flags);
+    if (srcIsYUV)
+      compressedSize = compressFromYUV(srcBuf, srcWidth, srcYUVPad, srcHeight,
+                                       subsamp, dstBuf, jpegQuality, flags);
+    else {
+      if (srcX >= 0 && srcY >= 0)
+        compressedSize = compress(srcBuf, srcX, srcY, srcWidth, srcPitch,
+                                  srcHeight, srcPixelFormat, dstBuf, subsamp,
+                                  jpegQuality, flags);
+      else
+        compressedSize = compress(srcBuf, srcWidth, srcPitch, srcHeight,
+                                  srcPixelFormat, dstBuf, subsamp, jpegQuality,
+                                  flags);
+    }
   }
 
   /**
@@ -330,20 +382,34 @@
     return buf;
   }
 
+
+  /**
+   * Set the plane padding for subsequent YUV encode operations.
+   *
+   * @param pad the width of each line in each plane of the YUV image will be
+   *        padded to the nearest multiple of this number of bytes (must be a
+   *        power of 2.)  The default padding is 4 bytes, which generates
+   *        images suitable for direct video display.
+   */
+  public void setYUVPad(int pad) throws Exception {
+    if(pad < 1 || ((pad & (pad - 1)) != 0))
+      throw new Exception("Invalid argument in setYUVPad()");
+    yuvPad = pad;
+  }
+
   /**
    * Encode the uncompressed source image associated with this compressor
    * instance and output a YUV planar image to the given destination buffer.
-   * This method uses the accelerated color conversion routines in
-   * TurboJPEG's underlying codec to produce a planar YUV image that is
-   * suitable for direct video display.  Specifically, if the chrominance
-   * components are subsampled along the horizontal dimension, then the width
-   * of the luminance plane is padded to the nearest multiple of 2 in the
-   * output image (same goes for the height of the luminance plane, if the
+   * This method uses the accelerated color conversion routines in TurboJPEG's
+   * underlying codec but does not execute any of the other steps in the JPEG
+   * compression process.  The Y, U (Cb), and V (Cr) image planes are stored
+   * sequentially into the destination buffer, and the size of each plane is
+   * determined by the width and height of the source image, as well as the
+   * specified padding and level of chrominance subsampling.  If the
+   * chrominance components are subsampled along the horizontal dimension, then
+   * the width of the luminance plane is padded to the nearest multiple of 2 in
+   * the output image (same goes for the height of the luminance plane, if the
    * chrominance components are subsampled along the vertical dimension.)
-   * Also, each line of each plane in the output image is padded to 4 bytes.
-   * Although this will work with any subsampling option, it is really only
-   * useful in combination with {@link TJ#SAMP_420}, which produces an image
-   * compatible with the I420 (AKA "YUV420P") format.
    * <p>
    * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
    * convention of the digital video community, the TurboJPEG API uses "YUV" to
@@ -362,9 +428,9 @@
       throw new Exception(NO_ASSOC_ERROR);
     if (subsamp < 0)
       throw new Exception("Subsampling level not set");
-    encodeYUV(srcBuf, srcWidth, srcPitch, srcHeight,
-              srcPixelFormat, dstBuf, subsamp, flags);
-    compressedSize = TJ.bufSizeYUV(srcWidth, srcHeight, subsamp);
+    encodeYUV(srcBuf, srcWidth, srcPitch, srcHeight, srcPixelFormat, dstBuf,
+              yuvPad, subsamp, flags);
+    compressedSize = TJ.bufSizeYUV(srcWidth, yuvPad, srcHeight, subsamp);
   }
 
   /**
@@ -381,7 +447,7 @@
       throw new Exception(NO_ASSOC_ERROR);
     if (subsamp < 0)
       throw new Exception("Subsampling level not set");
-    byte[] buf = new byte[TJ.bufSizeYUV(srcWidth, srcHeight, subsamp)];
+    byte[] buf = new byte[TJ.bufSizeYUV(srcWidth, yuvPad, srcHeight, subsamp)];
     encodeYUV(buf, flags);
     return buf;
   }
@@ -442,8 +508,8 @@
       int stride = sm.getScanlineStride();
       DataBufferInt db = (DataBufferInt)wr.getDataBuffer();
       int[] buf = db.getData();
-      encodeYUV(buf, width, stride, height, pixelFormat, dstBuf, subsamp,
-                flags);
+      encodeYUV(buf, width, stride, height, pixelFormat, dstBuf, yuvPad,
+                subsamp, flags);
     } else {
       ComponentSampleModel sm =
         (ComponentSampleModel)srcImage.getSampleModel();
@@ -453,10 +519,10 @@
       int pitch = sm.getScanlineStride();
       DataBufferByte db = (DataBufferByte)wr.getDataBuffer();
       byte[] buf = db.getData();
-      encodeYUV(buf, width, pitch, height, pixelFormat, dstBuf, subsamp,
-                flags);
+      encodeYUV(buf, width, pitch, height, pixelFormat, dstBuf, yuvPad,
+                subsamp, flags);
     }
-    compressedSize = TJ.bufSizeYUV(width, height, subsamp);
+    compressedSize = TJ.bufSizeYUV(width, yuvPad, height, subsamp);
   }
 
   /**
@@ -476,7 +542,7 @@
       throw new Exception("Subsampling level not set");
     int width = srcImage.getWidth();
     int height = srcImage.getHeight();
-    byte[] buf = new byte[TJ.bufSizeYUV(width, height, subsamp)];
+    byte[] buf = new byte[TJ.bufSizeYUV(width, yuvPad, height, subsamp)];
     encodeYUV(srcImage, buf, flags);
     return buf;
   }
@@ -529,13 +595,25 @@
     int stride, int height, int pixelFormat, byte[] dstBuf, int jpegSubsamp,
     int jpegQual, int flags) throws Exception;
 
+  private native int compressFromYUV(byte[] srcBuf, int width, int pad,
+    int height, int subsamp, byte[] dstBuf, int jpegQual, int flags)
+    throws Exception;
+
   private native void encodeYUV(byte[] srcBuf, int width, int pitch,
     int height, int pixelFormat, byte[] dstBuf, int subsamp, int flags)
-    throws Exception;
+    throws Exception; // deprecated
+
+  private native void encodeYUV(byte[] srcBuf, int width, int pitch,
+    int height, int pixelFormat, byte[] dstBuf, int pad, int subsamp,
+    int flags) throws Exception;
 
   private native void encodeYUV(int[] srcBuf, int width, int stride,
     int height, int pixelFormat, byte[] dstBuf, int subsamp, int flags)
-    throws Exception;
+    throws Exception; // deprecated
+
+  private native void encodeYUV(int[] srcBuf, int width, int pitch,
+    int height, int pixelFormat, byte[] dstBuf, int pad, int subsamp,
+    int flags) throws Exception;
 
   static {
     TJLoader.load();
@@ -549,8 +627,11 @@
   private int srcY = -1;
   private int srcPitch = 0;
   private int srcPixelFormat = -1;
+  private int srcYUVPad = -1;
+  private boolean srcIsYUV;
   private int subsamp = -1;
   private int jpegQuality = -1;
   private int compressedSize = 0;
+  private int yuvPad = 4;
   private ByteOrder byteOrder = null;
 };
diff --git a/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java b/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
index 5615b4e..bf78f2e 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java
@@ -39,7 +39,7 @@
   /**
    * A callback function that can be used to modify the DCT coefficients after
    * they are losslessly transformed but before they are transcoded to a new
-   * JPEG file.  This allows for custom filters or other transformations to be
+   * JPEG image.  This allows for custom filters or other transformations to be
    * applied in the frequency domain.
    *
    * @param coeffBuffer a buffer containing transformed DCT coefficients.
diff --git a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
index 46d1d5f..d14a989 100644
--- a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
@@ -117,7 +117,7 @@
 
   /**
    * Returns the level of chrominance subsampling used in the JPEG image
-   * associated with this decompressor instance.
+   * associated with this decompressor instance.  See {@link TJ TJ.SAMP_*}.
    *
    * @return the level of chrominance subsampling used in the JPEG image
    * associated with this decompressor instance
@@ -131,6 +131,21 @@
   }
 
   /**
+   * Returns the colorspace used in the JPEG image associated with this
+   * decompressor instance.  See {@link TJ TJ.CS_*}.
+   *
+   * @return the colorspace used in the JPEG image associated with this
+   * decompressor instance
+   */
+  public int getColorspace() throws Exception {
+    if (jpegColorspace < 0)
+      throw new Exception(NO_ASSOC_ERROR);
+    if (jpegColorspace >= TJ.NUMCS)
+      throw new Exception("JPEG header information is invalid");
+    return jpegColorspace;
+  }
+
+  /**
    * Returns the JPEG image buffer associated with this decompressor instance.
    *
    * @return the JPEG image buffer associated with this decompressor instance
@@ -313,6 +328,7 @@
    * @deprecated Use
    * {@link #decompress(byte[], int, int, int, int, int, int, int)} instead.
    */
+  @Deprecated
   public void decompress(byte[] dstBuf, int desiredWidth, int pitch,
                          int desiredHeight, int pixelFormat, int flags)
                          throws Exception {
@@ -377,39 +393,91 @@
    * {@link TJ#bufSizeYUV} to determine the appropriate size for this buffer
    * based on the image width, height, and level of chrominance subsampling.
    *
+   * @param desiredWidth desired width (in pixels) of the YUV image.  If the
+   * desired image dimensions are different than the dimensions of the JPEG
+   * image being decompressed, then TurboJPEG will use scaling in the JPEG
+   * decompressor to generate the largest possible image that will fit within
+   * the desired dimensions.  Setting this to 0 is the same as setting it to
+   * the width of the JPEG image (in other words, the width will not be
+   * considered when determining the scaled image size.)
+   *
+   * @param pad the width of each line in each plane of the YUV image will be
+   * padded to the nearest multiple of this number of bytes (must be a power of
+   * 2.)
+   *
+   * @param desiredHeight desired height (in pixels) of the YUV image.  If the
+   * desired image dimensions are different than the dimensions of the JPEG
+   * image being decompressed, then TurboJPEG will use scaling in the JPEG
+   * decompressor to generate the largest possible image that will fit within
+   * the desired dimensions.  Setting this to 0 is the same as setting it to
+   * the height of the JPEG image (in other words, the height will not be
+   * considered when determining the scaled image size.)
+   *
    * @param flags the bitwise OR of one or more of {@link TJ TJ.FLAG_*}
    */
-  public void decompressToYUV(byte[] dstBuf, int flags) throws Exception {
+  public void decompressToYUV(byte[] dstBuf, int desiredWidth, int pad,
+                              int desiredHeight, int flags) throws Exception {
     if (jpegBuf == null)
       throw new Exception(NO_ASSOC_ERROR);
-    if (dstBuf == null || flags < 0)
+    if (dstBuf == null || desiredWidth < 0 || pad < 1 ||
+        ((pad & (pad - 1)) != 0) || desiredHeight < 0 || flags < 0)
       throw new Exception("Invalid argument in decompressToYUV()");
-    decompressToYUV(jpegBuf, jpegBufSize, dstBuf, flags);
+    decompressToYUV(jpegBuf, jpegBufSize, dstBuf, desiredWidth, pad,
+                    desiredHeight, flags);
   }
 
+  /**
+   * @deprecated Use {@link #decompressToYUV(byte[], int, int, int, int)}
+   * instead.
+   */
+  @Deprecated
+  public void decompressToYUV(byte[] dstBuf, int flags) throws Exception {
+    decompressToYUV(dstBuf, 0, 4, 0, flags);
+  }
 
   /**
    * Decompress the JPEG source image associated with this decompressor
    * instance and return a buffer containing a YUV planar image.  See {@link
-   * #decompressToYUV(byte[], int)} for more detail.
+   * #decompressToYUV(byte[], int, int, int, int)} for more detail.
+   *
+   * @param desiredWidth see
+   * {@link #decompressToYUV(byte[], int, int, int, int)} for description
+   *
+   * @param pad see {@link #decompressToYUV(byte[], int, int, int, int)} for
+   * description
+   *
+   * @param desiredHeight see {@link
+   * #decompressToYUV(byte[], int, int, int, int)} for description
    *
    * @param flags the bitwise OR of one or more of {@link TJ TJ.FLAG_*}
    *
    * @return a buffer containing a YUV planar image
    */
-  public byte[] decompressToYUV(int flags) throws Exception {
+  public byte[] decompressToYUV(int desiredWidth, int pad, int desiredHeight,
+                                int flags) throws Exception {
     if (flags < 0)
       throw new Exception("Invalid argument in decompressToYUV()");
     if (jpegWidth < 1 || jpegHeight < 1 || jpegSubsamp < 0)
       throw new Exception(NO_ASSOC_ERROR);
     if (jpegSubsamp >= TJ.NUMSAMP)
       throw new Exception("JPEG header information is invalid");
-    byte[] buf = new byte[TJ.bufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp)];
-    decompressToYUV(buf, flags);
+    int scaledWidth = getScaledWidth(desiredWidth, desiredHeight);
+    int scaledHeight = getScaledHeight(desiredWidth, desiredHeight);
+    byte[] buf = new byte[TJ.bufSizeYUV(scaledWidth, pad, scaledHeight,
+                                        jpegSubsamp)];
+    decompressToYUV(buf, desiredWidth, pad, desiredHeight, flags);
     return buf;
   }
 
   /**
+   * @deprecated Use {@link #decompressToYUV(int, int, int, int)} instead.
+   */
+  @Deprecated
+  public byte[] decompressToYUV(int flags) throws Exception {
+    return decompressToYUV(0, 4, 0, flags);
+  }
+
+  /**
    * Decompress the JPEG source image associated with this decompressor
    * instance and output a decompressed image to the given destination buffer.
    *
@@ -623,7 +691,10 @@
     int flags) throws Exception;
 
   private native void decompressToYUV(byte[] srcBuf, int size, byte[] dstBuf,
-    int flags) throws Exception;
+    int flags) throws Exception; // deprecated
+
+  private native void decompressToYUV(byte[] srcBuf, int size, byte[] dstBuf,
+    int desiredWidth, int pad, int desiredheight, int flags) throws Exception;
 
   static {
     TJLoader.load();
@@ -635,5 +706,6 @@
   protected int jpegWidth = 0;
   protected int jpegHeight = 0;
   protected int jpegSubsamp = -1;
+  protected int jpegColorspace = -1;
   private ByteOrder byteOrder = null;
 };
diff --git a/java/org_libjpegturbo_turbojpeg_TJ.h b/java/org_libjpegturbo_turbojpeg_TJ.h
index d7b032a..b00a128 100644
--- a/java/org_libjpegturbo_turbojpeg_TJ.h
+++ b/java/org_libjpegturbo_turbojpeg_TJ.h
@@ -8,7 +8,7 @@
 extern "C" {
 #endif
 #undef org_libjpegturbo_turbojpeg_TJ_NUMSAMP
-#define org_libjpegturbo_turbojpeg_TJ_NUMSAMP 5L
+#define org_libjpegturbo_turbojpeg_TJ_NUMSAMP 6L
 #undef org_libjpegturbo_turbojpeg_TJ_SAMP_444
 #define org_libjpegturbo_turbojpeg_TJ_SAMP_444 0L
 #undef org_libjpegturbo_turbojpeg_TJ_SAMP_422
@@ -19,8 +19,10 @@
 #define org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY 3L
 #undef org_libjpegturbo_turbojpeg_TJ_SAMP_440
 #define org_libjpegturbo_turbojpeg_TJ_SAMP_440 4L
+#undef org_libjpegturbo_turbojpeg_TJ_SAMP_411
+#define org_libjpegturbo_turbojpeg_TJ_SAMP_411 5L
 #undef org_libjpegturbo_turbojpeg_TJ_NUMPF
-#define org_libjpegturbo_turbojpeg_TJ_NUMPF 11L
+#define org_libjpegturbo_turbojpeg_TJ_NUMPF 12L
 #undef org_libjpegturbo_turbojpeg_TJ_PF_RGB
 #define org_libjpegturbo_turbojpeg_TJ_PF_RGB 0L
 #undef org_libjpegturbo_turbojpeg_TJ_PF_BGR
@@ -43,16 +45,22 @@
 #define org_libjpegturbo_turbojpeg_TJ_PF_ABGR 9L
 #undef org_libjpegturbo_turbojpeg_TJ_PF_ARGB
 #define org_libjpegturbo_turbojpeg_TJ_PF_ARGB 10L
+#undef org_libjpegturbo_turbojpeg_TJ_PF_CMYK
+#define org_libjpegturbo_turbojpeg_TJ_PF_CMYK 11L
+#undef org_libjpegturbo_turbojpeg_TJ_NUMCS
+#define org_libjpegturbo_turbojpeg_TJ_NUMCS 5L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_RGB
+#define org_libjpegturbo_turbojpeg_TJ_CS_RGB 0L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_YCbCr
+#define org_libjpegturbo_turbojpeg_TJ_CS_YCbCr 1L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_GRAY
+#define org_libjpegturbo_turbojpeg_TJ_CS_GRAY 2L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_CMYK
+#define org_libjpegturbo_turbojpeg_TJ_CS_CMYK 3L
+#undef org_libjpegturbo_turbojpeg_TJ_CS_YCCK
+#define org_libjpegturbo_turbojpeg_TJ_CS_YCCK 4L
 #undef org_libjpegturbo_turbojpeg_TJ_FLAG_BOTTOMUP
 #define org_libjpegturbo_turbojpeg_TJ_FLAG_BOTTOMUP 2L
-#undef org_libjpegturbo_turbojpeg_TJ_FLAG_FORCEMMX
-#define org_libjpegturbo_turbojpeg_TJ_FLAG_FORCEMMX 8L
-#undef org_libjpegturbo_turbojpeg_TJ_FLAG_FORCESSE
-#define org_libjpegturbo_turbojpeg_TJ_FLAG_FORCESSE 16L
-#undef org_libjpegturbo_turbojpeg_TJ_FLAG_FORCESSE2
-#define org_libjpegturbo_turbojpeg_TJ_FLAG_FORCESSE2 32L
-#undef org_libjpegturbo_turbojpeg_TJ_FLAG_FORCESSE3
-#define org_libjpegturbo_turbojpeg_TJ_FLAG_FORCESSE3 128L
 #undef org_libjpegturbo_turbojpeg_TJ_FLAG_FASTUPSAMPLE
 #define org_libjpegturbo_turbojpeg_TJ_FLAG_FASTUPSAMPLE 256L
 #undef org_libjpegturbo_turbojpeg_TJ_FLAG_FASTDCT
@@ -70,9 +78,17 @@
 /*
  * Class:     org_libjpegturbo_turbojpeg_TJ
  * Method:    bufSizeYUV
+ * Signature: (IIII)I
+ */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
+  (JNIEnv *, jclass, jint, jint, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJ
+ * Method:    bufSizeYUV
  * Signature: (III)I
  */
-JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III
   (JNIEnv *, jclass, jint, jint, jint);
 
 /*
diff --git a/java/org_libjpegturbo_turbojpeg_TJCompressor.h b/java/org_libjpegturbo_turbojpeg_TJCompressor.h
index 2fc9136..50070ef 100644
--- a/java/org_libjpegturbo_turbojpeg_TJCompressor.h
+++ b/java/org_libjpegturbo_turbojpeg_TJCompressor.h
@@ -57,6 +57,14 @@
 
 /*
  * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    compressFromYUV
+ * Signature: ([BIIII[BII)I
+ */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3BIIII_3BII
+  (JNIEnv *, jobject, jbyteArray, jint, jint, jint, jint, jbyteArray, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
  * Method:    encodeYUV
  * Signature: ([BIIII[BII)V
  */
@@ -66,11 +74,27 @@
 /*
  * Class:     org_libjpegturbo_turbojpeg_TJCompressor
  * Method:    encodeYUV
+ * Signature: ([BIIII[BIII)V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII
+  (JNIEnv *, jobject, jbyteArray, jint, jint, jint, jint, jbyteArray, jint, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    encodeYUV
  * Signature: ([IIIII[BII)V
  */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
   (JNIEnv *, jobject, jintArray, jint, jint, jint, jint, jbyteArray, jint, jint);
 
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    encodeYUV
+ * Signature: ([IIIII[BIII)V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII
+  (JNIEnv *, jobject, jintArray, jint, jint, jint, jint, jbyteArray, jint, jint, jint);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/java/org_libjpegturbo_turbojpeg_TJDecompressor.h b/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
index f798a77..203f004 100644
--- a/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
+++ b/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
@@ -68,9 +68,17 @@
  * Method:    decompressToYUV
  * Signature: ([BI[BI)V
  */
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI
   (JNIEnv *, jobject, jbyteArray, jint, jbyteArray, jint);
 
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    decompressToYUV
+ * Signature: ([BI[BIIII)V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII
+  (JNIEnv *, jobject, jbyteArray, jint, jbyteArray, jint, jint, jint, jint);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/jcparam.c b/jcparam.c
index 2b9a740..f4e5eec 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -16,6 +16,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jstdhuff.c"
 
 
 /*
@@ -166,116 +167,6 @@
 
 
 /*
- * Huffman table setup routines
- */
-
-LOCAL(void)
-add_huff_table (j_compress_ptr cinfo,
-		JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
-/* Define a Huffman table */
-{
-  int nsymbols, len;
-
-  if (*htblptr == NULL)
-    *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-
-  /* Copy the number-of-symbols-of-each-code-length counts */
-  MEMCOPY((*htblptr)->bits, bits, SIZEOF((*htblptr)->bits));
-
-  /* Validate the counts.  We do this here mainly so we can copy the right
-   * number of symbols from the val[] array, without risking marching off
-   * the end of memory.  jchuff.c will do a more thorough test later.
-   */
-  nsymbols = 0;
-  for (len = 1; len <= 16; len++)
-    nsymbols += bits[len];
-  if (nsymbols < 1 || nsymbols > 256)
-    ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
-
-  MEMCOPY((*htblptr)->huffval, val, nsymbols * SIZEOF(UINT8));
-
-  /* Initialize sent_table FALSE so table will be written to JPEG file. */
-  (*htblptr)->sent_table = FALSE;
-}
-
-
-LOCAL(void)
-std_huff_tables (j_compress_ptr cinfo)
-/* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
-/* IMPORTANT: these are only valid for 8-bit data precision! */
-{
-  static const UINT8 bits_dc_luminance[17] =
-    { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_luminance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
-  
-  static const UINT8 bits_dc_chrominance[17] =
-    { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_chrominance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
-  
-  static const UINT8 bits_ac_luminance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
-  static const UINT8 val_ac_luminance[] =
-    { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
-      0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
-      0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
-      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
-      0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
-      0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
-      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
-      0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
-      0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
-      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-      0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-      0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
-      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-      0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-      0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
-      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
-      0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
-      0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
-      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
-      0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
-  
-  static const UINT8 bits_ac_chrominance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
-  static const UINT8 val_ac_chrominance[] =
-    { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
-      0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
-      0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
-      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
-      0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
-      0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
-      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
-      0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
-      0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
-      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
-      0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-      0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
-      0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
-      0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
-      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
-      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
-      0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
-      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
-      0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
-  
-  add_huff_table(cinfo, &cinfo->dc_huff_tbl_ptrs[0],
-		 bits_dc_luminance, val_dc_luminance);
-  add_huff_table(cinfo, &cinfo->ac_huff_tbl_ptrs[0],
-		 bits_ac_luminance, val_ac_luminance);
-  add_huff_table(cinfo, &cinfo->dc_huff_tbl_ptrs[1],
-		 bits_dc_chrominance, val_dc_chrominance);
-  add_huff_table(cinfo, &cinfo->ac_huff_tbl_ptrs[1],
-		 bits_ac_chrominance, val_ac_chrominance);
-}
-
-
-/*
  * Default parameter setup for compression.
  *
  * Applications that don't choose to use this routine must do their
@@ -313,7 +204,7 @@
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
   /* Set up two Huffman tables */
-  std_huff_tables(cinfo);
+  std_huff_tables((j_common_ptr) cinfo);
 
   /* Initialize default arithmetic coding conditioning */
   for (i = 0; i < NUM_ARITH_TBLS; i++) {
diff --git a/jddctmgr.c b/jddctmgr.c
index 0a5decb..88b4707 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -133,6 +133,11 @@
       method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
     case 6:
+#if defined(__mips__)
+      if (jsimd_can_idct_6x6())
+        method_ptr = jsimd_idct_6x6;
+      else
+#endif
       method_ptr = jpeg_idct_6x6;
       method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
@@ -188,6 +193,11 @@
       method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
     case 12:
+#if defined(__mips__)
+      if (jsimd_can_idct_12x12())
+        method_ptr = jsimd_idct_12x12;
+      else
+#endif
       method_ptr = jpeg_idct_12x12;
       method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
diff --git a/jdhuff.c b/jdhuff.c
index dba5f18..d21d399 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -21,6 +21,7 @@
 #include "jpeglib.h"
 #include "jdhuff.h"		/* Declarations shared with jdphuff.c */
 #include "jpegcomp.h"
+#include "jstdhuff.c"
 
 
 /*
@@ -795,6 +796,12 @@
   huff_entropy_ptr entropy;
   int i;
 
+  /* Motion JPEG frames typically do not include the Huffman tables if they
+     are the default tables.  Thus, if the tables are not set by the time
+     the Huffman decoder is initialized (usually within the body of
+     jpeg_start_decompress()), we set them to default values. */
+  std_huff_tables((j_common_ptr) cinfo);
+
   entropy = (huff_entropy_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				SIZEOF(huff_entropy_decoder));
diff --git a/jsimd_none.c b/jsimd_none.c
index 9787902..882fc08 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -258,6 +258,18 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_idct_6x6 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12 (void)
+{
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
@@ -272,6 +284,20 @@
 {
 }
 
+GLOBAL(void)
+jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+}
+
 GLOBAL(int)
 jsimd_can_idct_islow (void)
 {
diff --git a/jsimddct.h b/jsimddct.h
index a1c7440..9d2d945 100644
--- a/jsimddct.h
+++ b/jsimddct.h
@@ -68,6 +68,8 @@
 
 EXTERN(int) jsimd_can_idct_2x2 JPP((void));
 EXTERN(int) jsimd_can_idct_4x4 JPP((void));
+EXTERN(int) jsimd_can_idct_6x6 JPP((void));
+EXTERN(int) jsimd_can_idct_12x12 JPP((void));
 
 EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo,
                                  jpeg_component_info * compptr,
@@ -79,6 +81,16 @@
                                  JCOEFPTR coef_block,
                                  JSAMPARRAY output_buf,
                                  JDIMENSION output_col));
+EXTERN(void) jsimd_idct_6x6 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+EXTERN(void) jsimd_idct_12x12 JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
 
 EXTERN(int) jsimd_can_idct_islow JPP((void));
 EXTERN(int) jsimd_can_idct_ifast JPP((void));
diff --git a/jstdhuff.c b/jstdhuff.c
new file mode 100644
index 0000000..7fec6ca
--- /dev/null
+++ b/jstdhuff.c
@@ -0,0 +1,133 @@
+/*
+* jstdhuff.c
+*
+* This file was part of the Independent JPEG Group's software:
+* Copyright (C) 1991-1998, Thomas G. Lane.
+* libjpeg-turbo Modifications:
+* Copyright (C) 2013, D. R. Commander.
+* For conditions of distribution and use, see the accompanying README file.
+*
+* This file contains routines to set the default Huffman tables, if they are
+* not already set.
+*/
+
+/*
+ * Huffman table setup routines
+ */
+
+LOCAL(void)
+add_huff_table (j_common_ptr cinfo,
+		JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
+/* Define a Huffman table */
+{
+  int nsymbols, len;
+
+  if (*htblptr == NULL)
+    *htblptr = jpeg_alloc_huff_table(cinfo);
+  else
+    return;
+
+  /* Copy the number-of-symbols-of-each-code-length counts */
+  MEMCOPY((*htblptr)->bits, bits, SIZEOF((*htblptr)->bits));
+
+  /* Validate the counts.  We do this here mainly so we can copy the right
+   * number of symbols from the val[] array, without risking marching off
+   * the end of memory.  jchuff.c will do a more thorough test later.
+   */
+  nsymbols = 0;
+  for (len = 1; len <= 16; len++)
+    nsymbols += bits[len];
+  if (nsymbols < 1 || nsymbols > 256)
+    ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+
+  MEMCOPY((*htblptr)->huffval, val, nsymbols * SIZEOF(UINT8));
+
+  /* Initialize sent_table FALSE so table will be written to JPEG file. */
+  (*htblptr)->sent_table = FALSE;
+}
+
+
+LOCAL(void)
+std_huff_tables (j_common_ptr cinfo)
+/* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
+/* IMPORTANT: these are only valid for 8-bit data precision! */
+{
+  JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs;
+
+  static const UINT8 bits_dc_luminance[17] =
+    { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
+  static const UINT8 val_dc_luminance[] =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+
+  static const UINT8 bits_dc_chrominance[17] =
+    { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
+  static const UINT8 val_dc_chrominance[] =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+
+  static const UINT8 bits_ac_luminance[17] =
+    { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
+  static const UINT8 val_ac_luminance[] =
+    { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+      0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+      0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+      0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+      0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+      0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+      0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+      0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+      0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+      0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+      0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+      0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+      0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+      0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+      0xf9, 0xfa };
+
+  static const UINT8 bits_ac_chrominance[17] =
+    { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
+  static const UINT8 val_ac_chrominance[] =
+    { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+      0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+      0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+      0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+      0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+      0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+      0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+      0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+      0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+      0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+      0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+      0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+      0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+      0xf9, 0xfa };
+
+  if (cinfo->is_decompressor) {
+    dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs;
+    ac_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->ac_huff_tbl_ptrs;
+  } else {
+    dc_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->dc_huff_tbl_ptrs;
+    ac_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->ac_huff_tbl_ptrs;
+  }
+
+  add_huff_table(cinfo, &dc_huff_tbl_ptrs[0], bits_dc_luminance,
+                 val_dc_luminance);
+  add_huff_table(cinfo, &ac_huff_tbl_ptrs[0], bits_ac_luminance,
+                 val_ac_luminance);
+  add_huff_table(cinfo, &dc_huff_tbl_ptrs[1], bits_dc_chrominance,
+                 val_dc_chrominance);
+  add_huff_table(cinfo, &ac_huff_tbl_ptrs[1], bits_ac_chrominance,
+                 val_ac_chrominance);
+}
diff --git a/jversion.h b/jversion.h
index c37651b..032fef0 100644
--- a/jversion.h
+++ b/jversion.h
@@ -29,4 +29,6 @@
 			"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
 			"Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
 			"Copyright (C) 2009-2014 D. R. Commander\n" \
-			"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)"
+			"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+			"Copyright (C) 2013 MIPS Technologies, Inc.\n" \
+			"Copyright (C) 2013 Linaro Limited"
diff --git a/release/libjpeg-turbo.nsi.in b/release/libjpeg-turbo.nsi.in
index 4f65303..0514565 100755
--- a/release/libjpeg-turbo.nsi.in
+++ b/release/libjpeg-turbo.nsi.in
@@ -1,7 +1,7 @@
 !include x64.nsh
 Name "@CMAKE_PROJECT_NAME@ SDK for @INST_PLATFORM@"
 OutFile "@CMAKE_BINARY_DIR@\${BUILDDIR}@INST_NAME@.exe"
-InstallDir "@INST_DIR@"
+InstallDir @INST_DIR@
 
 SetCompressor bzip2
 
@@ -111,15 +111,13 @@
 !ifdef GCC
 	Delete $INSTDIR\bin\libjpeg-@DLL_VERSION@.dll
 	Delete $SYSDIR\libturbojpeg.dll
-	Delete $INSTDIR\bin\libturbojpeg.dll
-	Delete $INSTDIR\lib\libturbojpeg.dll.a
-	Delete $INSTDIR\lib\libturbojpeg.a
-	Delete $INSTDIR\lib\libjpeg.dll.a
-	Delete $INSTDIR\lib\libjpeg.a
+	Delete $INSTDIR\lib\libturbojpeg.dll.a"
+	Delete $INSTDIR\lib\libturbojpeg.a"
+	Delete $INSTDIR\lib\libjpeg.dll.a"
+	Delete $INSTDIR\lib\libjpeg.a"
 !else
 	Delete $INSTDIR\bin\jpeg@DLL_VERSION@.dll
 	Delete $SYSDIR\turbojpeg.dll
-	Delete $INSTDIR\bin\turbojpeg.dll
 	Delete $INSTDIR\lib\jpeg.lib
 	Delete $INSTDIR\lib\jpeg-static.lib
 	Delete $INSTDIR\lib\turbojpeg.lib
@@ -134,11 +132,11 @@
 	Delete $INSTDIR\bin\tjbench.exe
 	Delete $INSTDIR\bin\rdjpgcom.exe
 	Delete $INSTDIR\bin\wrjpgcom.exe
-	Delete $INSTDIR\include\jconfig.h
-	Delete $INSTDIR\include\jerror.h
-	Delete $INSTDIR\include\jmorecfg.h
-	Delete $INSTDIR\include\jpeglib.h
-	Delete $INSTDIR\include\turbojpeg.h
+	Delete $INSTDIR\include\jconfig.h"
+	Delete $INSTDIR\include\jerror.h"
+	Delete $INSTDIR\include\jmorecfg.h"
+	Delete $INSTDIR\include\jpeglib.h"
+	Delete $INSTDIR\include\turbojpeg.h"
 	Delete $INSTDIR\uninstall_@VERSION@.exe
 	Delete $INSTDIR\doc\README
 	Delete $INSTDIR\doc\README-turbo.txt
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
index 6c97814..616618d 100644
--- a/release/libjpeg-turbo.spec.in
+++ b/release/libjpeg-turbo.spec.in
@@ -124,7 +124,7 @@
 %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@
 %{_libdir}/libjpeg.so
 %{_libdir}/libjpeg.a
-%{_libdir}/libturbojpeg.so.0.0.0
+%{_libdir}/libturbojpeg.so.0.1.0
 %{_libdir}/libturbojpeg.so.0
 %{_libdir}/libturbojpeg.so
 %{_libdir}/libturbojpeg.a
diff --git a/simd/Makefile.am b/simd/Makefile.am
index a12ff6e..be2f0a5 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -58,6 +58,18 @@
 
 endif
 
+if SIMD_ARM_64
+
+libsimd_la_SOURCES = jsimd_arm64.c jsimd_arm_neon_64.S
+
+endif
+
+if SIMD_MIPSEL
+
+libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S
+
+endif
+
 AM_CPPFLAGS = -I$(top_srcdir) 
 
 .asm.lo:
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 3d4751f..f75aa9b 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -3,7 +3,8 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2011 D. R. Commander
- * 
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
+ *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
@@ -18,6 +19,7 @@
 #define JSIMD_SSE        0x04
 #define JSIMD_SSE2       0x08
 #define JSIMD_ARM_NEON   0x10
+#define JSIMD_MIPS_DSPR2 0x20
 
 /* Short forms of external names for systems with brain-damaged linkers. */
 
@@ -386,6 +388,93 @@
              JSAMPIMAGE input_buf, JDIMENSION input_row,
              JSAMPARRAY output_buf, int num_rows));
 
+EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+EXTERN (void) jsimd_ycc_rgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
+        JPP((JDIMENSION img_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
 /* SIMD Downsample */
 EXTERN(void) jsimd_h2v2_downsample_mmx
         JPP((JDIMENSION image_width, int max_v_samp_factor,
@@ -405,6 +494,15 @@
              JDIMENSION v_samp_factor, JDIMENSION width_blocks,
              JSAMPARRAY input_data, JSAMPARRAY output_data));
 
+EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
 /* SIMD Upsample */
 EXTERN(void) jsimd_h2v2_upsample_mmx
         JPP((int max_v_samp_factor, JDIMENSION output_width,
@@ -526,6 +624,20 @@
         JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
              JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
 /* SIMD Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
@@ -539,6 +651,10 @@
                                       JDIMENSION start_col,
                                       DCTELEM * workspace));
 
+EXTERN(void) jsimd_convsamp_mips_dspr2 JPP((JSAMPARRAY sample_data,
+                                            JDIMENSION start_col,
+                                            DCTELEM * workspace));
+
 EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
                                              JDIMENSION start_col,
                                              FAST_FLOAT * workspace));
@@ -551,6 +667,10 @@
                                             JDIMENSION start_col,
                                             FAST_FLOAT * workspace));
 
+EXTERN(void) jsimd_convsamp_float_mips_dspr2 JPP((JSAMPARRAY sample_data,
+                                                  JDIMENSION start_col,
+                                                  FAST_FLOAT * workspace));
+
 /* SIMD Forward DCT */
 EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
 EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
@@ -562,6 +682,9 @@
 
 EXTERN(void) jsimd_fdct_ifast_neon JPP((DCTELEM * data));
 
+EXTERN(void) jsimd_fdct_islow_mips_dspr2 JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_mips_dspr2 JPP((DCTELEM * data));
+
 EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
 
 extern const int jconst_fdct_float_sse[];
@@ -580,6 +703,10 @@
                                       DCTELEM * divisors,
                                       DCTELEM * workspace));
 
+EXTERN(void) jsimd_quantize_mips_dspr2 JPP((JCOEFPTR coef_block,
+                                            DCTELEM * divisors,
+                                            DCTELEM * workspace));
+
 EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
                                              FAST_FLOAT * divisors,
                                              FAST_FLOAT * workspace));
@@ -592,6 +719,10 @@
                                             FAST_FLOAT * divisors,
                                             FAST_FLOAT * workspace));
 
+EXTERN(void) jsimd_quantize_float_mips_dspr2 JPP((JCOEFPTR coef_block,
+                                                  FAST_FLOAT * divisors,
+                                                  FAST_FLOAT * workspace));
+
 /* SIMD Reduced Inverse DCT */
 EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
                                      JCOEFPTR coef_block,
@@ -621,6 +752,25 @@
                                       JSAMPARRAY output_buf,
                                       JDIMENSION output_col));
 
+EXTERN(void) jsimd_idct_2x2_mips_dspr2 JPP((void * dct_table,
+                                            JCOEFPTR coef_block,
+                                            JSAMPARRAY output_buf,
+                                            JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_mips_dspr2 JPP((void * dct_table,
+                                            JCOEFPTR coef_block,
+                                            JSAMPARRAY output_buf,
+                                            JDIMENSION output_col,
+                                            int * workspace));
+EXTERN(void) jsimd_idct_6x6_mips_dspr2 JPP((void * dct_table,
+                                            JCOEFPTR coef_block,
+                                            JSAMPARRAY output_buf,
+                                            JDIMENSION output_col));
+EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2 JPP((JCOEFPTR coef_block,
+                                                    void * dct_table,
+                                                    int * workspace));
+EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2 JPP((int * workspace,
+                                                    int * output));
+
 /* SIMD Inverse DCT */
 EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
                                        JCOEFPTR coef_block,
@@ -651,6 +801,15 @@
                                         JSAMPARRAY output_buf,
                                         JDIMENSION output_col));
 
+EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2 JPP((JCOEF * inptr,
+                                                   IFAST_MULT_TYPE * quantptr,
+                                                   DCTELEM * wsptr,
+                                                   const int * idct_coefs));
+EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2 JPP((DCTELEM * wsptr,
+                                                   JSAMPARRAY output_buf,
+                                                   JDIMENSION output_col,
+                                                   const int * idct_coefs));
+
 EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
                                          JCOEFPTR coef_block,
                                          JSAMPARRAY output_buf,
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index bd717a4..c6fa4ca 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -2,7 +2,7 @@
  * jsimd_arm.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011 D. R. Commander
+ * Copyright 2009-2011, 2013 D. R. Commander
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -123,10 +123,10 @@
 #endif
 
   /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCE_ARM_NEON");
+  env = getenv("JSIMD_FORCENEON");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support &= JSIMD_ARM_NEON;
-  env = getenv("JSIMD_FORCE_NO_SIMD");
+  env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
 }
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
new file mode 100644
index 0000000..d280d62
--- /dev/null
+++ b/simd/jsimd_arm64.c
@@ -0,0 +1,586 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011, 2013-2014 D. R. Commander
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on
+ * ARM architecture.
+ *
+ * Based on the stubs from 'jsimd_none.c'
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature (char *buffer, char *feature)
+{
+  char *p;
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo (int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "neon"))
+        simd_support |= JSIMD_ARM_NEON;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  char *env = NULL;
+#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_ARM_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* We still have a chance to use NEON regardless of globally used
+   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux/android */
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCENEON");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_ARM_NEON;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      neonfct=jsimd_ycc_extrgb_convert_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct=jsimd_ycc_extrgbx_convert_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct=jsimd_ycc_extbgr_convert_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct=jsimd_ycc_extbgrx_convert_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct=jsimd_ycc_extxbgr_convert_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct=jsimd_ycc_extxrgb_convert_neon;
+      break;
+  default:
+      neonfct=jsimd_ycc_extrgb_convert_neon;
+      break;
+  }
+
+  if (simd_support & JSIMD_ARM_NEON)
+    neonfct(cinfo->output_width, input_buf,
+        input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_ARM_NEON))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_ARM_NEON))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_ARM_NEON))
+    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_ARM_NEON))
+    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_ARM_NEON))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  init_simd();
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_ARM_NEON))
+    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_ARM_NEON))
+    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S
new file mode 100644
index 0000000..2d9e95e
--- /dev/null
+++ b/simd/jsimd_arm_neon_64.S
@@ -0,0 +1,1832 @@
+/*
+ * ARMv8 NEON optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
+ * All rights reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013, Linaro Limited
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+#endif
+
+.text
+.arch armv8-a+fp+simd
+
+
+#define RESPECT_STRICT_ALIGNMENT 1
+
+#define RTSM_SQSHRN_SIM_ISSUE
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .func _\fname
+    .globl _\fname
+_\fname:
+#else
+    .func \fname
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+/* Transpose elements of single 128 bit registers */
+.macro transpose_single x0,x1,xi,xilen,literal
+    ins  \xi\xilen[0],  \x0\xilen[0]
+    ins  \x1\xilen[0],  \x0\xilen[1]
+    trn1 \x0\literal,   \x0\literal, \x1\literal
+    trn2 \x1\literal,   \xi\literal, \x1\literal
+.endm
+
+/* Transpose elements of 2 differnet registers */
+.macro transpose x0,x1,xi,xilen,literal
+    mov  \xi\xilen,     \x0\xilen
+    trn1 \x0\literal,   \x0\literal, \x1\literal
+    trn2 \x1\literal,   \xi\literal, \x1\literal
+.endm
+
+/* Transpose a block of 4x4 coefficients in four 64-bit registers */
+.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
+    mov  \xi\xilen, \x0\xilen
+    trn1 \x0\x0len, \x0\x0len, \x2\x2len
+    trn2 \x2\x2len, \xi\x0len, \x2\x2len
+    mov  \xi\xilen, \x1\xilen
+    trn1 \x1\x1len, \x1\x1len, \x3\x3len
+    trn2 \x3\x3len, \xi\x1len, \x3\x3len
+.endm
+
+.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
+    mov  \xi\xilen, \x0\xilen
+    trn1 \x0\x0len, \x0\x0len, \x1\x1len
+    trn2 \x1\x2len, \xi\x0len, \x1\x2len
+    mov  \xi\xilen, \x2\xilen
+    trn1 \x2\x2len, \x2\x2len, \x3\x3len
+    trn2 \x3\x2len, \xi\x1len, \x3\x3len
+.endm
+
+.macro transpose_4x4 x0, x1, x2, x3,x5
+    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
+    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
+.endm
+
+
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
+ *                        JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336  (2446)
+#define FIX_0_390180644  (3196)
+#define FIX_0_541196100  (4433)
+#define FIX_0_765366865  (6270)
+#define FIX_0_899976223  (7373)
+#define FIX_1_175875602  (9633)
+#define FIX_1_501321110  (12299)
+#define FIX_1_847759065  (15137)
+#define FIX_1_961570560  (16069)
+#define FIX_2_053119869  (16819)
+#define FIX_2_562915447  (20995)
+#define FIX_3_072711026  (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
+{                                                                             \
+    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
+    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
+    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
+                                                                              \
+    /* 1-D iDCT input data */                                                 \
+    row0 = xrow0;                                                             \
+    row1 = xrow1;                                                             \
+    row2 = xrow2;                                                             \
+    row3 = xrow3;                                                             \
+    row4 = xrow4;                                                             \
+    row5 = xrow5;                                                             \
+    row6 = xrow6;                                                             \
+    row7 = xrow7;                                                             \
+                                                                              \
+    q5 = row7 + row3;                                                         \
+    q4 = row5 + row1;                                                         \
+    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
+         MULTIPLY(q4, FIX_1_175875602);                                       \
+    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
+         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
+    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
+         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
+    q4 = q6;                                                                  \
+    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
+    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
+          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
+    /* now we can use q1 (reloadable constants have been used up) */          \
+    q1 = q3 + q2;                                                             \
+    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
+          MULTIPLY(row1, -FIX_0_899976223);                                   \
+    q5 = q7;                                                                  \
+    q1 = q1 + q6;                                                             \
+    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
+          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
+                                                                              \
+    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
+    tmp11_plus_tmp2 = q1;                                                     \
+    row1 = 0;                                                                 \
+                                                                              \
+    q1 = q1 - q6;                                                             \
+    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
+          MULTIPLY(row3, -FIX_2_562915447);                                   \
+    q1 = q1 - q6;                                                             \
+    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
+         MULTIPLY(row6, FIX_0_541196100);                                     \
+    q3 = q3 - q2;                                                             \
+                                                                              \
+    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
+    tmp11_minus_tmp2 = q1;                                                    \
+                                                                              \
+    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
+    q2 = q1 + q6;                                                             \
+    q1 = q1 - q6;                                                             \
+                                                                              \
+    /* pick up the results */                                                 \
+    tmp0  = q4;                                                               \
+    tmp1  = q5;                                                               \
+    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
+    tmp3  = q7;                                                               \
+    tmp10 = q2;                                                               \
+    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
+    tmp12 = q3;                                                               \
+    tmp13 = q1;                                                               \
+}
+
+#define XFIX_0_899976223                    v0.4h[0]
+#define XFIX_0_541196100                    v0.4h[1]
+#define XFIX_2_562915447                    v0.4h[2]
+#define XFIX_0_298631336_MINUS_0_899976223  v0.4h[3]
+#define XFIX_1_501321110_MINUS_0_899976223  v1.4h[0]
+#define XFIX_2_053119869_MINUS_2_562915447  v1.4h[1]
+#define XFIX_0_541196100_PLUS_0_765366865   v1.4h[2]
+#define XFIX_1_175875602                    v1.4h[3]
+#define XFIX_1_175875602_MINUS_0_390180644  v2.4h[0]
+#define XFIX_0_541196100_MINUS_1_847759065  v2.4h[1]
+#define XFIX_3_072711026_MINUS_2_562915447  v2.4h[2]
+#define XFIX_1_175875602_MINUS_1_961570560  v2.4h[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+    .short FIX_0_899976223                    /* d0[0] */
+    .short FIX_0_541196100                    /* d0[1] */
+    .short FIX_2_562915447                    /* d0[2] */
+    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+    .short FIX_1_175875602                    /* d1[3] */
+    /* reloadable constants */
+    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+    DCT_TABLE       .req x0
+    COEF_BLOCK      .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_COL      .req x3
+    TMP1            .req x0
+    TMP2            .req x1
+    TMP3            .req x2
+    TMP4            .req x15
+
+    ROW0L           .req v16
+    ROW0R           .req v17
+    ROW1L           .req v18
+    ROW1R           .req v19
+    ROW2L           .req v20
+    ROW2R           .req v21
+    ROW3L           .req v22
+    ROW3R           .req v23
+    ROW4L           .req v24
+    ROW4R           .req v25
+    ROW5L           .req v26
+    ROW5R           .req v27
+    ROW6L           .req v28
+    ROW6R           .req v29
+    ROW7L           .req v30
+    ROW7R           .req v31
+
+    adr             x15, jsimd_idct_islow_neon_consts
+    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
+    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
+    mul             v16.4h, v16.4h, v0.4h
+    mul             v17.4h, v17.4h, v1.4h
+    ins             v16.2d[1], v17.2d[0]  /* 128 bit q8 */
+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
+    mul             v18.4h, v18.4h, v2.4h
+    mul             v19.4h, v19.4h, v3.4h
+    ins             v18.2d[1], v19.2d[0]  /* 128 bit q9 */
+    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
+    mul             v20.4h, v20.4h, v4.4h
+    mul             v21.4h, v21.4h, v5.4h
+    ins             v20.2d[1], v21.2d[0]  /* 128 bit q10 */
+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
+    mul             v22.4h, v22.4h, v6.4h
+    mul             v23.4h, v23.4h, v7.4h
+    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
+    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32
+    mul             v24.4h, v24.4h, v0.4h
+    mul             v25.4h, v25.4h, v1.4h
+    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
+    mul             v28.4h, v28.4h, v4.4h
+    mul             v29.4h, v29.4h, v5.4h
+    ins             v28.2d[1], v29.2d[0]  /* 128 bit q14 */
+    mul             v26.4h, v26.4h, v2.4h
+    mul             v27.4h, v27.4h, v3.4h
+    ins             v26.2d[1], v27.2d[0]  /* 128 bit q13 */
+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
+    add             x15, x15, #16
+    mul             v30.4h, v30.4h, v6.4h
+    mul             v31.4h, v31.4h, v7.4h
+    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
+    sub             sp,	sp, #32
+    st1             {v8.4h-v11.4h}, [sp]  /* save NEON registers */
+    sub             sp, sp, #32
+    st1             {v12.4h-v15.4h}, [sp]
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    add             v4.4h,    ROW7L.4h, ROW3L.4h
+    add             v5.4h,    ROW5L.4h, ROW1L.4h
+    smull           v12.4s,   v4.4h,    XFIX_1_175875602_MINUS_1_961570560
+    smlal           v12.4s,   v5.4h,    XFIX_1_175875602
+    smull           v14.4s,   v4.4h,    XFIX_1_175875602
+      /* Check for the zero coefficients in the right 4x8 half */
+      /* push            {x4, x5} */
+      stp             x4, x5, [sp, -16]!
+      mov             x5, #0
+    smlal           v14.4s,   v5.4h,    XFIX_1_175875602_MINUS_0_390180644
+    ssubl           v6.4s,    ROW0L.4h, ROW4L.4h
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
+    smlal           v4.4s,    ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
+      orr             x0,       x4,       x5
+    mov             v8.16b,   v12.16b
+    smlsl           v12.4s,   ROW5L.4h, XFIX_2_562915447
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
+    shl             v6.4s,    v6.4s,    #13
+      orr             x0,       x0,       x4
+    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
+      orr             x0,       x0 ,      x5
+    add             v2.4s,    v6.4s,    v4.4s
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    mov             v10.16b,  v14.16b
+    add             v2.4s,    v2.4s,    v12.4s
+      orr             x0,       x0,       x4
+    smlsl           v14.4s,   ROW7L.4h, XFIX_0_899976223
+      orr             x0,       x0,       x5
+    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
+    rshrn           ROW1L.4h, v2.4s,    #11
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    sub             v2.4s,    v2.4s,    v12.4s
+    smlal           v10.4s,   ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
+      orr             x0,       x0,       x4
+    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
+      orr             x0,       x0,       x5
+    sub             v2.4s,    v2.4s,    v12.4s
+    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    smlal           v12.4s,   ROW6L.4h, XFIX_0_541196100
+    sub             v6.4s,    v6.4s,    v4.4s
+      orr             x0,       x0,       x4
+    rshrn           ROW6L.4h, v2.4s,    #11
+      orr             x0,       x0,       x5
+    add             v2.4s,    v6.4s,    v10.4s
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    sub             v6.4s,    v6.4s,    v10.4s
+    saddl           v10.4s,   ROW0L.4h, ROW4L.4h
+      orr             x0,       x0,       x4
+    rshrn           ROW2L.4h, v2.4s,    #11
+      orr             x0,       x0,       x5
+    rshrn           ROW5L.4h, v6.4s,    #11
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    shl             v10.4s,   v10.4s,   #13
+    smlal           v8.4s,    ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
+      orr             x0,       x0,       x4
+    add             v4.4s,    v10.4s,   v12.4s
+      orr             x0,       x0,       x5
+    sub             v2.4s,    v10.4s,   v12.4s
+    add             v12.4s,   v4.4s,    v14.4s
+      ldr             x4,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    sub             v4.4s,    v4.4s,    v14.4s
+    add             v10.4s,   v2.4s,    v8.4s
+      orr             x0,       x4,       x5
+    sub             v6.4s,    v2.4s,    v8.4s
+      /* pop             {x4, x5} */ 
+      ldp             x4, x5, [sp], 16
+    rshrn           ROW7L.4h, v4.4s,    #11
+    rshrn           ROW3L.4h, v10.4s,   #11
+    rshrn           ROW0L.4h, v12.4s,   #11
+    rshrn           ROW4L.4h, v6.4s,    #11
+    cmp             x0, #0 /* orrs instruction removed */
+
+      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    ld1             {v2.4h},  [x15]    /* reload constants */
+    add             v10.4h,   ROW7R.4h, ROW3R.4h
+    add             v8.4h,    ROW5R.4h, ROW1R.4h
+    /* Transpose ROW6L <-> ROW7L   (v3 available free register) */
+    transpose       ROW6L, ROW7L, v3, .16b, .4h
+    smull           v12.4s,   v10.4h,   XFIX_1_175875602_MINUS_1_961570560
+    smlal           v12.4s,   v8.4h,    XFIX_1_175875602
+    /* Transpose ROW2L <-> ROW3L   (v3 available free register) */
+    transpose       ROW2L, ROW3L, v3, .16b, .4h
+    smull           v14.4s,   v10.4h,   XFIX_1_175875602
+    smlal           v14.4s,   v8.4h,    XFIX_1_175875602_MINUS_0_390180644
+    /* Transpose ROW0L <-> ROW1L   (v3 available free register) */
+    transpose       ROW0L, ROW1L, v3, .16b, .4h
+    ssubl           v6.4s,    ROW0R.4h, ROW4R.4h
+    smull           v4.4s,    ROW2R.4h, XFIX_0_541196100
+    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
+    /* Transpose ROW4L <-> ROW5L   (v3 available free register) */
+    transpose       ROW4L, ROW5L, v3, .16b, .4h
+    mov             v8.16b,   v12.16b
+    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
+    smlal           v12.4s,   ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
+    /* Transpose ROW1L <-> ROW3L   (v3 available free register) */
+    transpose       ROW1L, ROW3L, v3, .16b, .2s
+    shl             v6.4s,    v6.4s,    #13
+    smlsl           v8.4s,    ROW1R.4h, XFIX_0_899976223
+    /* Transpose ROW4L <-> ROW6L   (v3 available free register) */
+    transpose       ROW4L, ROW6L, v3, .16b, .2s
+    add             v2.4s,    v6.4s,    v4.4s
+    mov             v10.16b,  v14.16b
+    add             v2.4s,    v2.4s,    v12.4s
+    /* Transpose ROW0L <-> ROW2L   (v3 available free register) */
+    transpose       ROW0L, ROW2L, v3, .16b, .2s
+    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
+    smlal           v14.4s,   ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
+    rshrn           ROW1R.4h, v2.4s,    #11
+    /* Transpose ROW5L <-> ROW7L   (v3 available free register) */
+    transpose       ROW5L, ROW7L, v3, .16b, .2s
+    sub             v2.4s,    v2.4s,    v12.4s
+    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
+    smlsl           v10.4s,   ROW3R.4h, XFIX_2_562915447
+    sub             v2.4s,    v2.4s,    v12.4s
+    smull           v12.4s,   ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
+    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
+    sub             v6.4s,    v6.4s,    v4.4s
+    rshrn           ROW6R.4h, v2.4s,    #11
+    add             v2.4s,    v6.4s,    v10.4s
+    sub             v6.4s,    v6.4s,    v10.4s
+    saddl           v10.4s,   ROW0R.4h, ROW4R.4h
+    rshrn           ROW2R.4h, v2.4s,    #11
+    rshrn           ROW5R.4h, v6.4s,    #11
+    shl             v10.4s,   v10.4s,   #13
+    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
+    add             v4.4s,    v10.4s,   v12.4s
+    sub             v2.4s,    v10.4s,   v12.4s
+    add             v12.4s,   v4.4s,    v14.4s
+    sub             v4.4s,    v4.4s,    v14.4s
+    add             v10.4s,   v2.4s,    v8.4s
+    sub             v12.4s,   v2.4s,    v8.4s
+    rshrn           ROW7R.4h, v4.4s,    #11
+    rshrn           ROW3R.4h, v10.4s,   #11
+    rshrn           ROW0R.4h, v12.4s,   #11
+    rshrn           ROW4R.4h, v6.4s,    #11
+    /* Transpose right 4x8 half */
+    transpose       ROW6R, ROW7R, v3, .16b, .4h
+    transpose       ROW2R, ROW3R, v3, .16b, .4h
+    transpose       ROW0R, ROW1R, v3, .16b, .4h
+    transpose       ROW4R, ROW5R, v3, .16b, .4h
+    transpose       ROW1R, ROW3R, v3, .16b, .2s
+    transpose       ROW4R, ROW6R, v3, .16b, .2s
+    transpose       ROW0R, ROW2R, v3, .16b, .2s
+    transpose       ROW5R, ROW7R, v3, .16b, .2s
+
+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    ld1             {v2.4h},  [x15]    /* reload constants */
+    smull           v12.4S,   ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
+    smlal           v12.4s,   ROW1L.4h, XFIX_1_175875602
+    smlal           v12.4s,   ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
+    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
+    smull           v14.4s,   ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
+    smlal           v14.4s,   ROW3L.4h, XFIX_1_175875602
+    smlal           v14.4s,   ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
+    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
+    ssubl           v6.4s,    ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
+    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
+    smlal           v4.4s,    ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
+    mov             v8.16b,   v12.16b
+    smlsl           v12.4s,   ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
+    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
+    shl             v6.4s,    v6.4s,    #13
+    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
+    add             v2.4s,    v6.4s,    v4.4s
+    mov             v10.16b,  v14.16b
+    add             v2.4s,    v2.4s,    v12.4s
+    smlsl           v14.4s,   ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
+    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
+    shrn            ROW1L.4h, v2.4s,    #16
+    sub             v2.4s,    v2.4s,    v12.4s
+    smlal           v10.4s,   ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
+    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
+    sub             v2.4s,    v2.4s,    v12.4s
+    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
+    smlal           v12.4s,   ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
+    sub             v6.4s,    v6.4s,    v4.4s
+    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
+    add             v2.4s,    v6.4s,    v10.4s
+    sub             v6.4s,    v6.4s,    v10.4s
+    saddl           v10.4s,   ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
+    shrn            ROW2L.4h, v2.4s,    #16
+    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
+    shl             v10.4s,   v10.4s,   #13
+    smlal           v8.4s,    ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
+    add             v4.4s,    v10.4s,   v12.4s
+    sub             v2.4s,    v10.4s,   v12.4s
+    add             v12.4s,   v4.4s,    v14.4s
+    sub             v4.4s,    v4.4s,    v14.4s
+    add             v10.4s,   v2.4s,    v8.4s
+    sub             v6.4s,    v2.4s,    v8.4s
+    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn            ROW3L.4h, v10.4s,   #16
+    shrn            ROW0L.4h, v12.4s,   #16
+    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    ld1             {v2.4h},  [x15]    /* reload constants */
+    smull           v12.4s,   ROW5R.4h, XFIX_1_175875602
+    smlal           v12.4s,   ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
+    smlal           v12.4s,   ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
+    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
+    smull           v14.4s,   ROW7R.4h, XFIX_1_175875602
+    smlal           v14.4s,   ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
+    smlal           v14.4s,   ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
+    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
+    ssubl           v6.4s,    ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
+    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
+    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
+    mov             v8.16b,   v12.16b
+    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
+    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
+    shl             v6.4s,    v6.4s,    #13
+    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
+    add             v2.4s,    v6.4s,    v4.4s
+    mov             v10.16b,  v14.16b
+    add             v2.4s,    v2.4s,    v12.4s
+    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
+    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
+    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
+    sub             v2.4s,    v2.4s,    v12.4s
+    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
+    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
+    sub             v2.4s,    v2.4s,    v12.4s
+    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
+    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
+    sub             v6.4s,    v6.4s,    v4.4s
+    shrn            ROW6R.4h, v2.4s,    #16
+    add             v2.4s,    v6.4s,    v10.4s
+    sub             v6.4s,    v6.4s,    v10.4s
+    saddl           v10.4s,   ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
+    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
+    shrn            ROW5R.4h, v6.4s,    #16
+    shl             v10.4s,   v10.4s,   #13
+    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
+    add             v4.4s,    v10.4s,   v12.4s
+    sub             v2.4s,    v10.4s,   v12.4s
+    add             v12.4s,   v4.4s,    v14.4s
+    sub             v4.4s,    v4.4s,    v14.4s
+    add             v10.4s,   v2.4s,    v8.4s
+    sub             v6.4s,    v2.4s,    v8.4s
+    shrn            ROW7R.4h, v4.4s,    #16
+    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
+    shrn            ROW4R.4h, v6.4s,    #16
+
+2:  /* Descale to 8-bit and range limit */
+    ins             v16.2d[1], v17.2d[0]
+    ins             v18.2d[1], v19.2d[0]
+    ins             v20.2d[1], v21.2d[0]
+    ins             v22.2d[1], v23.2d[0]
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqrshrn         v16.8b,   v16.8h,   #2
+    sqrshrn2        v16.16b,  v18.8h,   #2
+    sqrshrn         v18.8b,   v20.8h,   #2
+    sqrshrn2        v18.16b,  v22.8h,   #2
+#else
+    sqrshrn         v16.4h,   v16.4s,   #2
+    sqrshrn2        v16.8h,   v18.4s,   #2
+    sqrshrn         v18.4h,   v20.4s,   #2
+    sqrshrn2        v18.8h,   v22.4s,   #2
+#endif
+    /* vpop            {v8.4h-d15.4h} */ /* restore NEON registers */
+
+    ld1             {v12.4h-v15.4h}, [sp], 32
+    ld1             {v8.4h-v11.4h}, [sp], 32
+    ins             v24.2d[1], v25.2d[0]
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+
+    sqrshrn         v20.8b,   v24.8h,   #2
+#else
+
+    sqrshrn         v20.4h,   v24.4s,   #2
+#endif
+      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+    /* trn1            v16.8h,    v16.8h,  v18.8h */
+    transpose       v16, v18, v3, .16b, .8h
+    ins             v26.2d[1], v27.2d[0]
+    ins             v28.2d[1], v29.2d[0]
+    ins             v30.2d[1], v31.2d[0]
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqrshrn2        v20.16b,  v26.8h,   #2
+    sqrshrn         v22.8b,   v28.8h,   #2
+#else
+    sqrshrn2        v20.8h,   v26.4s,   #2
+    sqrshrn         v22.4h,   v28.4s,   #2
+#endif
+    movi            v0.16b,   #(CENTERJSAMPLE)
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqrshrn2        v22.16b,  v30.8h,   #2
+#else
+    sqrshrn2        v22.8h,   v30.4s,   #2
+#endif
+    transpose_single v16, v17, v3, .2d, .8b
+    transpose_single v18, v19, v3, .2d, .8b
+    add             v16.8b,   v16.8b,   v0.8b
+    add             v17.8b,   v17.8b,   v0.8b
+    add             v18.8b,   v18.8b,   v0.8b
+    add             v19.8b,   v19.8b,   v0.8b
+    transpose       v20, v22, v3, .16b, .8h
+    /* Store results to the output buffer */
+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
+    add             TMP1,     TMP1,     OUTPUT_COL
+    add             TMP2,     TMP2,     OUTPUT_COL
+    st1             {v16.8b}, [TMP1]
+    transpose_single v20, v21, v3, .2d, .8b
+    st1             {v17.8b}, [TMP2]
+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
+    add             TMP1,     TMP1,     OUTPUT_COL
+    add             TMP2,     TMP2,     OUTPUT_COL
+    st1             {v18.8b}, [TMP1]
+    add             v20.8b,   v20.8b,   v0.8b
+    add             v21.8b,   v21.8b,   v0.8b
+    st1             {v19.8b}, [TMP2]
+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
+    ldp             TMP3,     TMP4,     [OUTPUT_BUF]
+    add             TMP1,     TMP1,     OUTPUT_COL
+    add             TMP2,     TMP2,     OUTPUT_COL
+    add             TMP3,     TMP3,     OUTPUT_COL
+    add             TMP4,     TMP4,     OUTPUT_COL
+    transpose_single v22, v23, v3, .2d, .8b
+    st1             {v20.8b}, [TMP1]
+    add             v22.8b,   v22.8b,   v0.8b
+    add             v23.8b,   v23.8b,   v0.8b
+    st1             {v21.8b}, [TMP2]
+    st1             {v22.8b}, [TMP3]
+    st1             {v23.8b}, [TMP4]
+    blr             x30
+
+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    transpose       ROW6L, ROW7L, v3, .16b, .4h
+    transpose       ROW2L, ROW3L, v3, .16b, .4h
+    transpose       ROW0L, ROW1L, v3, .16b, .4h
+    transpose       ROW4L, ROW5L, v3, .16b, .4h
+    shl             ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
+    transpose       ROW1L, ROW3L, v3, .16b, .2s
+    transpose       ROW4L, ROW6L, v3, .16b, .2s
+    transpose       ROW0L, ROW2L, v3, .16b, .2s
+    transpose       ROW5L, ROW7L, v3, .16b, .2s
+    cmp             x0, #0
+    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half  */
+    dup             ROW1R.4h, ROW0R.4h[1]
+    dup             ROW2R.4h, ROW0R.4h[2]
+    dup             ROW3R.4h, ROW0R.4h[3]
+    dup             ROW4R.4h, ROW0R.4h[0]
+    dup             ROW5R.4h, ROW0R.4h[1]
+    dup             ROW6R.4h, ROW0R.4h[2]
+    dup             ROW7R.4h, ROW0R.4h[3]
+    dup             ROW0R.4h, ROW0R.4h[0]
+    b               1b /* Go to 'normal' second pass */
+
+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    ld1             {v2.4h},  [x15]    /* reload constants */
+    smull           v12.4s,   ROW1L.4h, XFIX_1_175875602
+    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
+    smull           v14.4s,   ROW3L.4h, XFIX_1_175875602
+    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
+    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
+    sshll           v6.4s,    ROW0L.4h, #13
+    mov             v8.16b,   v12.16b
+    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
+    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
+    add             v2.4s,    v6.4s,    v4.4s
+    mov             v10.16b,  v14.16b
+    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
+    add             v2.4s,    v2.4s,    v12.4s
+    add             v12.4s,   v12.4s,   v12.4s
+    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
+    shrn            ROW1L.4h, v2.4s,    #16
+    sub             v2.4s,    v2.4s,    v12.4s
+    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
+    sub             v6.4s,    v6.4s,    v4.4s
+    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
+    add             v2.4s,    v6.4s,    v10.4s
+    sub             v6.4s,    v6.4s,    v10.4s
+    sshll           v10.4s,   ROW0L.4h, #13
+    shrn            ROW2L.4h, v2.4s,    #16
+    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
+    add             v4.4s,    v10.4s,   v12.4s
+    sub             v2.4s,    v10.4s,   v12.4s
+    add             v12.4s,   v4.4s,    v14.4s
+    sub             v4.4s,    v4.4s,    v14.4s
+    add             v10.4s,   v2.4s,    v8.4s
+    sub             v6.4s,    v2.4s,    v8.4s
+    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn            ROW3L.4h, v10.4s,   #16
+    shrn            ROW0L.4h, v12.4s,   #16
+    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    ld1             {v2.4h},  [x15]    /* reload constants */
+    smull           v12.4s,   ROW5L.4h, XFIX_1_175875602
+    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
+    smull           v14.4s,   ROW7L.4h, XFIX_1_175875602
+    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
+    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100
+    sshll           v6.4s,    ROW4L.4h, #13
+    mov             v8.16b,   v12.16b
+    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
+    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223
+    add             v2.4s,    v6.4s,    v4.4s
+    mov             v10.16b,  v14.16b
+    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
+    add             v2.4s,    v2.4s,    v12.4s
+    add             v12.4s,   v12.4s,   v12.4s
+    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447
+    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
+    sub             v2.4s,    v2.4s,    v12.4s
+    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
+    sub             v6.4s,    v6.4s,    v4.4s
+    shrn            ROW6R.4h, v2.4s,    #16
+    add             v2.4s,    v6.4s,    v10.4s
+    sub             v6.4s,    v6.4s,    v10.4s
+    sshll           v10.4s,   ROW4L.4h, #13
+    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
+    shrn            ROW5R.4h, v6.4s,    #16
+    add             v4.4s,    v10.4s,   v12.4s
+    sub             v2.4s,    v10.4s,   v12.4s
+    add             v12.4s,   v4.4s,    v14.4s
+    sub             v4.4s,    v4.4s,    v14.4s
+    add             v10.4s,   v2.4s,    v8.4s
+    sub             v6.4s,    v2.4s,    v8.4s
+    shrn            ROW7R.4h, v4.4s,    #16
+    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
+    shrn            ROW4R.4h, v6.4s,    #16
+    b               2b /* Go to epilogue */
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+    .unreq          ROW0L
+    .unreq          ROW0R
+    .unreq          ROW1L
+    .unreq          ROW1R
+    .unreq          ROW2L
+    .unreq          ROW2R
+    .unreq          ROW3L
+    .unreq          ROW3R
+    .unreq          ROW4L
+    .unreq          ROW4R
+    .unreq          ROW5L
+    .unreq          ROW5R
+    .unreq          ROW6L
+    .unreq          ROW6R
+    .unreq          ROW7L
+    .unreq          ROW7R
+.endfunc
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in ARM NEON case some extra additions are required because VQDMULH
+ * instruction can't handle the constants larger than 1. So the expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
+
+#define XFIX_1_082392200 v0.4h[0]
+#define XFIX_1_414213562 v0.4h[1]
+#define XFIX_1_847759065 v0.4h[2]
+#define XFIX_2_613125930 v0.4h[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
+    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
+    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
+    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req x0
+    COEF_BLOCK      .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_COL      .req x3
+    TMP1            .req x0
+    TMP2            .req x1
+    TMP3            .req x2
+    TMP4            .req x15
+
+    /* Load and dequantize coefficients into NEON registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( v8.8h  )
+     *   1 | d18     | d19     ( v9.8h  )
+     *   2 | d20     | d21     ( v10.8h )
+     *   3 | d22     | d23     ( v11.8h )
+     *   4 | d24     | d25     ( v12.8h )
+     *   5 | d26     | d27     ( v13.8h )
+     *   6 | d28     | d29     ( v14.8h )
+     *   7 | d30     | d31     ( v15.8h )
+     */
+    adr             x15, jsimd_idct_ifast_neon_consts
+    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
+    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
+    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
+    mul             v8.8h,  v8.8h,  v0.8h
+    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
+    mul             v9.8h,  v9.8h,  v1.8h
+    ld1             {v12.8h, v13.8h}, [COEF_BLOCK], 32
+    mul             v10.8h, v10.8h, v2.8h
+    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
+    mul             v11.8h, v11.8h, v3.8h
+    ld1             {v14.8h, v15.8h}, [COEF_BLOCK], 32
+    mul             v12.8h, v12.8h, v0.8h
+    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
+    mul             v14.8h, v14.8h, v2.8h
+    mul             v13.8h, v13.8h, v1.8h
+    ld1             {v0.4h}, [x15]      /* load constants */
+    mul             v15.8h, v15.8h, v3.8h
+
+    /* vpush           {v4.8h-v6.8h} */ /* save NEON registers */
+    sub             sp, sp, #32
+    st1             {v4.8h-v5.8h}, [sp] /* save NEON registers */
+    sub             sp, sp, #16
+    st1             {v6.8h}, [sp]
+    /* 1-D IDCT, pass 1 */
+    sub             v2.8h,    v10.8h,   v14.8h
+    add             v14.8h,   v10.8h,   v14.8h
+    sub             v1.8h,    v11.8h,   v13.8h
+    add             v13.8h,   v11.8h,   v13.8h
+    sub             v5.8h,    v9.8h,    v15.8h
+    add             v15.8h,   v9.8h,    v15.8h
+    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
+    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
+    add             v3.8h,    v1.8h,    v1.8h
+    sub             v1.8h,    v5.8h,    v1.8h
+    add             v10.8h,   v2.8h,    v4.8h
+    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
+    sub             v2.8h,    v15.8h,   v13.8h
+    add             v3.8h,    v3.8h,    v6.8h
+    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
+    add             v1.8h,    v1.8h,    v4.8h
+    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
+    sub             v10.8h,   v10.8h,   v14.8h
+    add             v2.8h,    v2.8h,    v6.8h
+    sub             v6.8h,    v8.8h,    v12.8h
+    add             v12.8h,   v8.8h,    v12.8h
+    add             v9.8h,    v5.8h,    v4.8h
+    add             v5.8h,    v6.8h,    v10.8h
+    sub             v10.8h,   v6.8h,    v10.8h
+    add             v6.8h,    v15.8h,   v13.8h
+    add             v8.8h,    v12.8h,   v14.8h
+    sub             v3.8h,    v6.8h,    v3.8h
+    sub             v12.8h,   v12.8h,   v14.8h
+    sub             v3.8h,    v3.8h,    v1.8h
+    sub             v1.8h,    v9.8h,    v1.8h
+    add             v2.8h,    v3.8h,    v2.8h
+    sub             v15.8h,   v8.8h,    v6.8h
+    add             v1.8h,    v1.8h,    v2.8h
+    add             v8.8h,    v8.8h,    v6.8h
+    add             v14.8h,   v5.8h,    v3.8h
+    sub             v9.8h,    v5.8h,    v3.8h
+    sub             v13.8h,   v10.8h,   v2.8h
+    add             v10.8h,   v10.8h,   v2.8h
+    /* Transpose  q8-q9 */
+    mov             v18.16b,  v8.16b
+    trn1            v8.8h,    v8.8h,    v9.8h
+    trn2            v9.8h,    v18.8h,   v9.8h
+    sub             v11.8h,   v12.8h,   v1.8h
+    /* Transpose  q14-q15 */
+    mov             v18.16b,  v14.16b
+    trn1            v14.8h,   v14.8h,   v15.8h
+    trn2            v15.8h,   v18.8h,   v15.8h
+    add             v12.8h,   v12.8h,   v1.8h
+    /* Transpose  q10-q11 */
+    mov             v18.16b,  v10.16b
+    trn1            v10.8h,   v10.8h,   v11.8h
+    trn2            v11.8h,   v18.8h,   v11.8h
+    /* Transpose  q12-q13 */
+    mov             v18.16b,  v12.16b
+    trn1            v12.8h,   v12.8h,   v13.8h
+    trn2            v13.8h,   v18.8h,   v13.8h
+    /* Transpose  q9-q11 */
+    mov             v18.16b,  v9.16b
+    trn1            v9.4s,    v9.4s,    v11.4s
+    trn2            v11.4s,   v18.4s,   v11.4s
+    /* Transpose  q12-q14 */
+    mov             v18.16b,  v12.16b
+    trn1            v12.4s,   v12.4s,   v14.4s
+    trn2            v14.4s,   v18.4s,   v14.4s
+    /* Transpose  q8-q10 */
+    mov             v18.16b,  v8.16b
+    trn1            v8.4s,    v8.4s,    v10.4s
+    trn2            v10.4s,   v18.4s,   v10.4s
+    /* Transpose  q13-q15 */
+    mov             v18.16b,  v13.16b
+    trn1            v13.4s,   v13.4s,   v15.4s
+    trn2            v15.4s,   v18.4s,   v15.4s
+    /* vswp            v14.4h,   v10-MSB.4h */
+    umov            x10, v14.d[0]
+    ins             v14.2d[0], v10.2d[1]
+    ins             v10.2d[1], x10
+    /* vswp            v13.4h,   v9MSB.4h */
+
+    umov            x10, v13.d[0]
+    ins             v13.2d[0], v9.2d[1]
+    ins             v9.2d[1], x10
+    /* 1-D IDCT, pass 2 */
+    sub             v2.8h,    v10.8h,   v14.8h
+    /* vswp            v15.4h,   v11MSB.4h */
+    umov            x10, v15.d[0]
+    ins             v15.2d[0], v11.2d[1]
+    ins             v11.2d[1], x10
+    add             v14.8h,   v10.8h,   v14.8h
+    /* vswp            v12.4h,   v8-MSB.4h */
+    umov            x10, v12.d[0]
+    ins             v12.2d[0], v8.2d[1]
+    ins             v8.2d[1], x10
+    sub             v1.8h,    v11.8h,   v13.8h
+    add             v13.8h,   v11.8h,   v13.8h
+    sub             v5.8h,    v9.8h,    v15.8h
+    add             v15.8h,   v9.8h,    v15.8h
+    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
+    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
+    add             v3.8h,    v1.8h,    v1.8h
+    sub             v1.8h,    v5.8h,    v1.8h
+    add             v10.8h,   v2.8h,    v4.8h
+    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
+    sub             v2.8h,    v15.8h,   v13.8h
+    add             v3.8h,    v3.8h,    v6.8h
+    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
+    add             v1.8h,    v1.8h,    v4.8h
+    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
+    sub             v10.8h,   v10.8h,   v14.8h
+    add             v2.8h,    v2.8h,    v6.8h
+    sub             v6.8h,    v8.8h,    v12.8h
+    add             v12.8h,   v8.8h,    v12.8h
+    add             v9.8h,    v5.8h,    v4.8h
+    add             v5.8h,    v6.8h,    v10.8h
+    sub             v10.8h,   v6.8h,    v10.8h
+    add             v6.8h,    v15.8h,   v13.8h
+    add             v8.8h,    v12.8h,   v14.8h
+    sub             v3.8h,    v6.8h,    v3.8h
+    sub             v12.8h,   v12.8h,   v14.8h
+    sub             v3.8h,    v3.8h,    v1.8h
+    sub             v1.8h,    v9.8h,    v1.8h
+    add             v2.8h,    v3.8h,    v2.8h
+    sub             v15.8h,   v8.8h,    v6.8h
+    add             v1.8h,    v1.8h,    v2.8h
+    add             v8.8h,    v8.8h,    v6.8h
+    add             v14.8h,   v5.8h,    v3.8h
+    sub             v9.8h,    v5.8h,    v3.8h
+    sub             v13.8h,   v10.8h,   v2.8h
+    /* vpop            {v4.8h-v7.4h} */  /* restore NEON registers...not available */
+    ld1             {v6.8h}, [sp], 16
+    ld1             {v4.8h-v5.8h}, [sp], 32
+    add             v10.8h,   v10.8h,   v2.8h
+    sub             v11.8h,   v12.8h,   v1.8h
+    add             v12.8h,   v12.8h,   v1.8h
+    /* Descale to 8-bit and range limit */
+    movi            v0.16b,   #0x80
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqshrn          v8.8b,    v8.8h,    #5
+    sqshrn2         v8.16b,   v9.8h,    #5
+    sqshrn          v9.8b,    v10.8h,   #5
+    sqshrn2         v9.16b,   v11.8h,   #5
+    sqshrn          v10.8b,   v12.8h,   #5
+    sqshrn2         v10.16b,  v13.8h,   #5
+    sqshrn          v11.8b,   v14.8h,   #5
+    sqshrn2         v11.16b,  v15.8h,   #5
+#else
+    sqshrn          v8.4h,    v8.4s,    #5
+    sqshrn2         v8.8h,    v9.4s,    #5
+    sqshrn          v9.4h,    v10.4s,   #5
+    sqshrn2         v9.8h,    v11.4s,   #5
+    sqshrn          v10.4h,   v12.4s,   #5
+    sqshrn2         v10.8h,   v13.4s,   #5
+    sqshrn          v11.4h,   v14.4s,   #5
+    sqshrn2         v11.8h,   v15.4s,   #5
+#endif
+    add             v8.16b,   v8.16b,   v0.16b
+    add             v9.16b,   v9.16b,   v0.16b
+    add             v10.16b,  v10.16b,  v0.16b
+    add             v11.16b,  v11.16b,  v0.16b
+    /* Transpose the final 8-bit samples */
+    /* Transpose  q8-q9 */
+    mov             v18.16b,  v8.16b
+    trn1            v8.8h,    v8.8h,    v9.8h
+    trn2            v9.8h,    v18.8h,   v9.8h
+    /* Transpose  q10-q11 */
+    mov             v18.16b,  v10.16b
+    trn1            v10.8h,   v10.8h,   v11.8h
+    trn2            v11.8h,   v18.8h,   v11.8h
+    /* Transpose  q8-q10 */
+    mov             v18.16b,  v8.16b
+    trn1            v8.4s,    v8.4s,    v10.4s
+    trn2            v10.4s,   v18.4s,   v10.4s
+    /* Transpose  q9-q11 */
+    mov             v18.16b,  v9.16b
+    trn1            v9.4s,    v9.4s,    v11.4s
+    trn2            v11.4s,   v18.4s,   v11.4s
+    /* make copy */
+    ins             v17.2d[0], v8.2d[1]
+    /* Transpose  d16-d17-msb */
+    mov             v18.16b,  v8.16b
+    trn1            v8.8b,    v8.8b,    v17.8b
+    trn2            v17.8b,   v18.8b,   v17.8b
+    /* make copy */
+    ins             v19.2d[0], v9.2d[1]
+    mov             v18.16b,  v9.16b
+    trn1            v9.8b,    v9.8b,    v19.8b
+    trn2            v19.8b,   v18.8b,   v19.8b
+    /* Store results to the output buffer */
+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
+    add             TMP1,     TMP1,     OUTPUT_COL
+    add             TMP2,     TMP2,     OUTPUT_COL
+    st1             {v8.8b},  [TMP1]
+    st1             {v17.8b}, [TMP2]
+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
+    add             TMP1,     TMP1,     OUTPUT_COL
+    add             TMP2,     TMP2,     OUTPUT_COL
+    st1             {v9.8b},  [TMP1]
+    /* make copy */
+    ins             v21.2d[0], v10.2d[1]
+    mov             v18.16b,  v10.16b
+    trn1            v10.8b,   v10.8b,   v21.8b
+    trn2            v21.8b,   v18.8b,   v21.8b
+    st1             {v19.8b}, [TMP2]
+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
+    ldp             TMP3,     TMP4,     [OUTPUT_BUF]
+    add             TMP1,     TMP1,     OUTPUT_COL
+    add             TMP2,     TMP2,     OUTPUT_COL
+    add             TMP3,     TMP3,     OUTPUT_COL
+    add             TMP4,     TMP4,     OUTPUT_COL
+    st1             {v10.8b}, [TMP1]
+    /* make copy */
+    ins             v23.2d[0], v11.2d[1]
+    mov             v18.16b,  v11.16b
+    trn1            v11.8b,   v11.8b,   v23.8b
+    trn2            v23.8b,   v18.8b,   v23.8b
+    st1             {v21.8b}, [TMP2]
+    st1             {v11.8b}, [TMP3]
+    st1             {v23.8b}, [TMP4]
+    blr             x30
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+.endfunc
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_4x4_neon
+ *
+ * This function contains inverse-DCT code for getting reduced-size
+ * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
+ * function from jpeg-6b (jidctred.c).
+ *
+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
+ *       requires much less arithmetic operations and hence should be faster.
+ *       The primary purpose of this particular NEON optimized function is
+ *       bit exact compatibility with jpeg-6b.
+ *
+ * TODO: a bit better instructions scheduling can be achieved by expanding
+ *       idct_helper/transpose_4x4 macros and reordering instructions,
+ *       but readability will suffer somewhat.
+ */
+
+#define CONST_BITS  13
+
+#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
+#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
+#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
+#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
+#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
+#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
+#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
+#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
+#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
+#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
+#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
+#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
+#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
+#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+
+.balign 16
+jsimd_idct_4x4_neon_consts:
+    .short     FIX_1_847759065     /* v0.4h[0] */
+    .short     -FIX_0_765366865    /* v0.4h[1] */
+    .short     -FIX_0_211164243    /* v0.4h[2] */
+    .short     FIX_1_451774981     /* v0.4h[3] */
+    .short     -FIX_2_172734803    /* d1[0] */
+    .short     FIX_1_061594337     /* d1[1] */
+    .short     -FIX_0_509795579    /* d1[2] */
+    .short     -FIX_0_601344887    /* d1[3] */
+    .short     FIX_0_899976223     /* v2.4h[0] */
+    .short     FIX_2_562915447     /* v2.4h[1] */
+    .short     1 << (CONST_BITS+1) /* v2.4h[2] */
+    .short     0                   /* v2.4h[3] */
+
+.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
+    smull           v28.4s, \x4,    v2.4h[2]
+    smlal           v28.4s, \x8,    v0.4h[0]
+    smlal           v28.4s, \x14,   v0.4h[1]
+                                    
+    smull           v26.4s, \x16,   v1.4h[2]
+    smlal           v26.4s, \x12,   v1.4h[3]
+    smlal           v26.4s, \x10,   v2.4h[0]
+    smlal           v26.4s, \x6,    v2.4h[1]
+                                    
+    smull           v30.4s, \x4,    v2.4h[2]
+    smlsl           v30.4s, \x8,    v0.4h[0]
+    smlsl           v30.4s, \x14,   v0.4h[1]
+                                    
+    smull           v24.4s, \x16,   v0.4h[2]
+    smlal           v24.4s, \x12,   v0.4h[3]
+    smlal           v24.4s, \x10,   v1.4h[0]
+    smlal           v24.4s, \x6,    v1.4h[1]
+                                    
+    add             v20.4s, v28.4s, v26.4s
+    sub             v28.4s, v28.4s, v26.4s
+
+.if \shift > 16
+    srshr           v20.4s, v20.4s, #\shift
+    srshr           v28.4s, v28.4s, #\shift
+    xtn             \y26,   v20.4s
+    xtn             \y29,   v28.4s
+.else               
+    rshrn           \y26,   v20.4s, #\shift
+    rshrn           \y29,   v28.4s, #\shift
+.endif              
+                    
+    add             v20.4s, v30.4s, v24.4s
+    sub             v30.4s, v30.4s, v24.4s
+
+.if \shift > 16
+    srshr           v20.4s, v20.4s, #\shift
+    srshr           v30.4s, v30.4s, #\shift
+    xtn             \y27,   v20.4s
+    xtn             \y28,   v30.4s
+.else
+    rshrn           \y27,   v20.4s, #\shift
+    rshrn           \y28,   v30.4s, #\shift
+.endif
+
+.endm
+
+asm_function jsimd_idct_4x4_neon
+
+    DCT_TABLE       .req x0
+    COEF_BLOCK      .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_COL      .req x3
+    TMP1            .req x0
+    TMP2            .req x1
+    TMP3            .req x2
+    TMP4            .req x15
+
+    /* vpush           {v8.4h-v15.4h} */
+    sub             sp, sp, #32
+    st1             {v8.4h-v11.4h}, [sp] /* save NEON registers */
+    sub             sp, sp, #32
+    st1             {v12.4h-v15.4h}, [sp]
+
+    /* Load constants (v3.4h is just used for padding) */
+    adr             TMP4, jsimd_idct_4x4_neon_consts
+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | v4.4h   | v5.4h
+     *   1 | v6.4h   | v7.4h
+     *   2 | v8.4h   | v9.4h
+     *   3 | v10.4h  | v11.4h
+     *   4 | -       | -
+     *   5 | v12.4h  | v13.4h
+     *   6 | v14.4h  | v15.4h
+     *   7 | v16.4h  | v17.4h
+     */
+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
+    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
+    add             COEF_BLOCK, COEF_BLOCK, #16
+    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
+    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
+    /* dequantize */
+    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
+    mul             v4.4h, v4.4h, v18.4h
+    mul             v5.4h, v5.4h, v19.4h
+    ins             v4.2d[1], v5.2d[0]    /* 128 bit q4 */
+    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
+    mul             v6.4h, v6.4h, v20.4h
+    mul             v7.4h, v7.4h, v21.4h
+    ins             v6.2d[1], v7.2d[0]    /* 128 bit q6 */
+    mul             v8.4h, v8.4h, v22.4h
+    mul             v9.4h, v9.4h, v23.4h
+    ins             v8.2d[1], v9.2d[0]    /* 128 bit q8 */
+    add             DCT_TABLE, DCT_TABLE, #16
+    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
+    mul             v10.4h, v10.4h, v24.4h
+    mul             v11.4h, v11.4h, v25.4h
+    ins             v10.2d[1], v11.2d[0]  /* 128 bit q10 */
+    mul             v12.4h, v12.4h, v26.4h
+    mul             v13.4h, v13.4h, v27.4h
+    ins             v12.2d[1], v13.2d[0]  /* 128 bit q12 */
+    ld1             {v30.8h}, [DCT_TABLE], 16
+    mul             v14.4h, v14.4h, v28.4h
+    mul             v15.4h, v15.4h, v29.4h
+    ins             v14.2d[1], v15.2d[0]  /* 128 bit q14 */
+    mul             v16.4h, v16.4h, v30.4h
+    mul             v17.4h, v17.4h, v31.4h
+    ins             v16.2d[1], v17.2d[0]  /* 128 bit q16 */
+
+    /* Pass 1 */
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
+    transpose_4x4   v4, v6, v8, v10, v3
+    ins             v10.2d[1], v11.2d[0]
+    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
+    transpose_4x4   v5, v7, v9, v11, v3
+    ins             v10.2d[1], v11.2d[0]
+    /* Pass 2 */
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
+    transpose_4x4   v26, v27, v28, v29, v3
+
+    /* Range limit */
+    movi            v30.8h, #0x80
+    ins             v26.2d[1], v27.2d[0]
+    ins             v28.2d[1], v29.2d[0]
+    add             v26.8h, v26.8h, v30.8h
+    add             v28.8h, v28.8h, v30.8h
+    sqxtun          v26.8b, v26.8h
+    sqxtun          v27.8b, v28.8h
+
+    /* Store results to the output buffer */
+    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    ldp             TMP3, TMP4, [OUTPUT_BUF]
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    add             TMP3, TMP3, OUTPUT_COL
+    add             TMP4, TMP4, OUTPUT_COL
+
+#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
+    /* We can use much less instructions on little endian systems if the
+     * OS kernel is not configured to trap unaligned memory accesses
+     */
+    st1             {v26.s}[0], [TMP1], 4
+    st1             {v27.s}[0], [TMP3], 4
+    st1             {v26.s}[1], [TMP2], 4
+    st1             {v27.s}[1], [TMP4], 4
+#else
+    st1             {v26.b}[0], [TMP1], 1
+    st1             {v27.b}[0], [TMP3], 1
+    st1             {v26.b}[1], [TMP1], 1
+    st1             {v27.b}[1], [TMP3], 1
+    st1             {v26.b}[2], [TMP1], 1
+    st1             {v27.b}[2], [TMP3], 1
+    st1             {v26.b}[3], [TMP1], 1
+    st1             {v27.b}[3], [TMP3], 1
+                    
+    st1             {v26.b}[4], [TMP2], 1
+    st1             {v27.b}[4], [TMP4], 1
+    st1             {v26.b}[5], [TMP2], 1
+    st1             {v27.b}[5], [TMP4], 1
+    st1             {v26.b}[6], [TMP2], 1
+    st1             {v27.b}[6], [TMP4], 1
+    st1             {v26.b}[7], [TMP2], 1
+    st1             {v27.b}[7], [TMP4], 1
+#endif
+
+    /* vpop            {v8.4h-v15.4h}    ;not available */
+    ld1             {v12.4h-v15.4h}, [sp], 32
+    ld1             {v8.4h-v11.4h}, [sp], 32
+
+    blr             x30
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+.endfunc
+
+.purgem idct_helper
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_2x2_neon
+ *
+ * This function contains inverse-DCT code for getting reduced-size
+ * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
+ * function from jpeg-6b (jidctred.c).
+ *
+ * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
+ *       requires much less arithmetic operations and hence should be faster.
+ *       The primary purpose of this particular NEON optimized function is
+ *       bit exact compatibility with jpeg-6b.
+ */
+
+.balign 8
+jsimd_idct_2x2_neon_consts:
+    .short     -FIX_0_720959822    /* d0[0] */
+    .short     FIX_0_850430095     /* d0[1] */
+    .short     -FIX_1_272758580    /* d0[2] */
+    .short     FIX_3_624509785     /* d0[3] */
+
+.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
+    sshll      v28.4s, \x4,    #15
+    smull      v26.4s, \x6,    v0.4h[3]
+    smlal      v26.4s, \x10,   v0.4h[2]
+    smlal      v26.4s, \x12,   v0.4h[1]
+    smlal      v26.4s, \x16,   v0.4h[0]
+
+    add        v20.4s, v28.4s, v26.4s
+    sub        v28.4s, v28.4s, v26.4s
+
+.if \shift > 16
+    srshr      v20.4s, v20.4s, #\shift
+    srshr      v28.4s, v28.4s, #\shift
+    xtn        \y26,   v20.4s
+    xtn        \y27,   v28.4s
+.else
+    rshrn      \y26,   v20.4s, #\shift
+    rshrn      \y27,   v28.4s, #\shift
+.endif
+
+.endm
+
+asm_function jsimd_idct_2x2_neon
+
+    DCT_TABLE       .req x0
+    COEF_BLOCK      .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_COL      .req x3
+    TMP1            .req x0
+    TMP2            .req x15
+
+    /* vpush           {v8.4h-v15.4h}            ; not available */
+    sub             sp, sp, #32
+    st1             {v8.4h-v11.4h}, [sp]  /* save NEON registers */
+    sub             sp, sp, #32
+    st1             {v12.4h-v15.4h}, [sp]
+
+    /* Load constants */
+    adr             TMP2, jsimd_idct_2x2_neon_consts
+    ld1             {v0.4h}, [TMP2]
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | v4.4h   | v5.4h
+     *   1 | v6.4h   | v7.4h
+     *   2 | -       | -
+     *   3 | v10.4h  | v11.4h
+     *   4 | -       | -
+     *   5 | v12.4h  | v13.4h
+     *   6 | -       | -
+     *   7 | v16.4h  | v17.4h
+     */
+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
+    add             COEF_BLOCK, COEF_BLOCK, #16
+    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
+    add             COEF_BLOCK, COEF_BLOCK, #16
+    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
+    add             COEF_BLOCK, COEF_BLOCK, #16
+    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
+    /* Dequantize */
+    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
+    mul             v4.8h, v4.8h, v18.8h
+    mul             v5.8h, v5.8h, v18.8h
+    ins             v4.2d[1], v5.2d[0]
+    mul             v6.8h, v6.8h, v20.8h
+    mul             v7.8h, v7.8h, v21.8h
+    ins             v6.2d[1], v7.2d[0]
+    add             DCT_TABLE, DCT_TABLE, #16
+    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
+    mul             v10.8h, v10.8h, v24.8h
+    mul             v11.8h, v11.8h, v25.8h
+    ins             v10.2d[1], v11.2d[0]
+    add             DCT_TABLE, DCT_TABLE, #16
+    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
+    mul             v12.8h, v12.8h, v26.8h
+    mul             v13.8h, v13.8h, v27.8h
+    ins             v12.2d[1], v13.2d[0]
+    add             DCT_TABLE, DCT_TABLE, #16
+    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
+    mul             v16.8h, v16.8h, v30.8h
+    mul             v17.8h, v17.8h, v31.8h
+    ins             v16.2d[1], v17.2d[0]
+
+    /* Pass 1 */
+#if 0
+    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
+    transpose_4x4   v4.4h, v6.4h, v8.4h,  v10.4h
+    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
+    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
+#else
+    smull           v26.4s, v6.4h,  v0.4h[3]
+    smlal           v26.4s, v10.4h, v0.4h[2]
+    smlal           v26.4s, v12.4h, v0.4h[1]
+    smlal           v26.4s, v16.4h, v0.4h[0]
+    smull           v24.4s, v7.4h,  v0.4h[3]
+    smlal           v24.4s, v11.4h, v0.4h[2]
+    smlal           v24.4s, v13.4h, v0.4h[1]
+    smlal           v24.4s, v17.4h, v0.4h[0]
+    sshll           v28.4s, v4.4h,  #15
+    sshll           v30.4s, v5.4h,  #15
+    add             v20.4s, v28.4s, v26.4s
+    sub             v28.4s, v28.4s, v26.4s
+    rshrn           v4.4h,  v20.4s, #13
+    rshrn           v6.4h,  v28.4s, #13
+    add             v20.4s, v30.4s, v24.4s
+    sub             v28.4s, v30.4s, v24.4s
+    rshrn           v5.4h,  v20.4s, #13
+    rshrn           v7.4h,  v28.4s, #13
+    transpose       v4, v6, v3, .16b, .8h
+    transpose       v6, v10, v3, .16b, .4s
+#endif
+
+    /* Pass 2 */
+    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
+
+    /* Range limit */
+    movi            v30.8h, #0x80
+    ins             v26.2d[1], v27.2d[0]
+    add             v26.8h, v26.8h, v30.8h
+    sqxtun          v30.8b, v26.8h
+    ins             v26.2d[0], v30.2d[0]
+    sqxtun          v27.8b, v26.8h
+
+    /* Store results to the output buffer */
+    ldp             TMP1, TMP2, [OUTPUT_BUF]
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+
+    st1             {v26.b}[0], [TMP1], 1
+    st1             {v27.b}[4], [TMP1], 1
+    st1             {v26.b}[1], [TMP2], 1
+    st1             {v27.b}[5], [TMP2], 1
+
+    /* vpop            {v8.4h-v15.4h} ;not available */
+
+    ld1             {v12.4h-v15.4h}, [sp], 32
+    ld1             {v8.4h-v11.4h}, [sp], 32
+
+    blr             x30
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+.endfunc
+
+.purgem idct_helper
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_ycc_extrgb_convert_neon
+ * jsimd_ycc_extbgr_convert_neon
+ * jsimd_ycc_extrgbx_convert_neon
+ * jsimd_ycc_extbgrx_convert_neon
+ * jsimd_ycc_extxbgr_convert_neon
+ * jsimd_ycc_extxrgb_convert_neon
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+
+.macro do_load size
+    .if \size == 8
+        ld1  {v4.8b}, [U], 8
+        ld1  {v5.8b}, [V], 8
+        ld1  {v0.8b}, [Y], 8
+        prfm PLDL1KEEP, [U, #64]
+        prfm PLDL1KEEP, [V, #64]
+        prfm PLDL1KEEP, [Y, #64]
+    .elseif \size == 4
+        ld1  {v4.b}[0], [U]
+        ld1  {v4.b}[1], [U]
+        ld1  {v4.b}[2], [U]
+        ld1  {v4.b}[3], [U]
+        ld1  {v5.b}[0], [V]
+        ld1  {v5.b}[1], [V], 1
+        ld1  {v5.b}[2], [V], 1
+        ld1  {v5.b}[3], [V], 1
+        ld1  {v0.b}[0], [Y], 1
+        ld1  {v0.b}[1], [Y], 1
+        ld1  {v0.b}[2], [Y], 1
+        ld1  {v0.b}[3], [Y], 1
+    .elseif \size == 2
+        ld1  {v4.b}[4], [U], 1
+        ld1  {v4.b}[5], [U], 1
+        ld1  {v5.b}[4], [V], 1
+        ld1  {v5.b}[5], [V], 1
+        ld1  {v0.b}[4], [Y], 1
+        ld1  {v0.b}[5], [Y], 1
+    .elseif \size == 1
+        ld1  {v4.b}[6], [U], 1
+        ld1  {v5.b}[6], [V], 1
+        ld1  {v0.b}[6], [Y], 1
+    .else
+        .error unsupported macroblock size
+    .endif
+.endm
+
+.macro do_store bpp, size
+    .if \bpp == 24
+        .if \size == 8
+            st3  {v10.8b, v11.8b, v12.8b}, [RGB], 24
+        .elseif \size == 4
+            st3  {v10.b, v11.b, v12.b}[0], [RGB], 3
+            st3  {v10.b, v11.b, v12.b}[1], [RGB], 3
+            st3  {v10.b, v11.b, v12.b}[2], [RGB], 3
+            st3  {v10.b, v11.b, v12.b}[3], [RGB], 3
+        .elseif \size == 2
+            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
+            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
+        .elseif \size == 1
+            st3  {v10.b, v11.b, v12.b}[6], [RGB], 3
+        .else
+            .error unsupported macroblock size
+        .endif
+    .elseif \bpp == 32
+        .if \size == 8
+            st4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+        .elseif \size == 4
+            st4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
+            st4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
+            st4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
+            st4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
+        .elseif \size == 2
+            st4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
+            st4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
+        .elseif \size == 1
+            st4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+        .else
+            .error unsupported macroblock size
+        .endif
+    .else
+        .error unsupported bpp
+    .endif
+.endm
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
+#else
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize
+#endif
+/*
+ * 2 stage pipelined YCbCr->RGB conversion
+ */
+
+.macro do_yuv_to_rgb_stage1
+    uaddw        v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
+    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
+    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
+    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
+    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
+    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
+    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
+    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
+    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb_stage2
+    rshrn        v20.4h, v20.4s, #15
+    rshrn2       v20.8h, v22.4s, #15
+    rshrn        v24.4h, v24.4s, #14
+    rshrn2       v24.8h, v26.4s, #14
+    rshrn        v28.4h, v28.4s, #14
+    rshrn2       v28.8h, v30.4s, #14
+    uaddw        v20.8h, v20.8h, v0.8b
+    uaddw        v24.8h, v24.8h, v0.8b
+    uaddw        v28.8h, v28.8h, v0.8b
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqxtun       v1\g_offs\defsize, v20.8h
+    sqxtun       v1\r_offs\defsize, v24.8h
+    sqxtun       v1\b_offs\defsize, v28.8h
+
+#else
+    sqxtun       v1\g_offs\gsize, v20.4s
+    sqxtun       v1\r_offs\rsize, v24.4s
+    sqxtun       v1\b_offs\bsize, v28.4s
+#endif
+.endm
+
+.macro do_yuv_to_rgb_stage2_store_load_stage1
+    ld1          {v4.8b}, [U], 8
+    rshrn        v20.4h, v20.4s, #15
+    rshrn2       v20.8h, v22.4s, #15
+    rshrn        v24.4h, v24.4s, #14
+    rshrn2       v24.8h, v26.4s, #14
+    rshrn        v28.4h, v28.4s, #14
+    ld1          {v5.8b}, [V], 8
+    rshrn2       v28.8h, v30.4s, #14
+    uaddw        v20.8h, v20.8h, v0.8b
+    uaddw        v24.8h, v24.8h, v0.8b
+    uaddw        v28.8h, v28.8h, v0.8b
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqxtun       v1\g_offs\defsize, v20.8h
+#else
+    sqxtun       v1\g_offs\gsize, v20.4s
+#endif
+    ld1          {v0.8b}, [Y], 8
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqxtun       v1\r_offs\defsize, v24.8h
+#else
+    sqxtun       v1\r_offs\rsize, v24.4s
+#endif
+    prfm         PLDL1KEEP, [U, #64]
+    prfm         PLDL1KEEP, [V, #64]
+    prfm         PLDL1KEEP, [Y, #64]
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqxtun       v1\b_offs\defsize, v28.8h
+#else
+    sqxtun       v1\b_offs\gsize, v28.4s
+#endif
+    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
+    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
+    do_store     \bpp, 8
+    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
+    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
+    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
+    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
+    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
+    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
+    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
+    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb
+    do_yuv_to_rgb_stage1
+    do_yuv_to_rgb_stage2
+.endm
+
+/* Apple gas crashes on adrl, work around that by using adr.
+ * But this requires a copy of these constants for each function.
+ */
+
+.balign 16
+jsimd_ycc_\colorid\()_neon_consts:
+    .short          0,      0,     0,      0
+    .short          22971, -11277, -23401, 29033
+    .short          -128,  -128,   -128,   -128
+    .short          -128,  -128,   -128,   -128
+
+asm_function jsimd_ycc_\colorid\()_convert_neon
+    OUTPUT_WIDTH    .req x0
+    INPUT_BUF       .req x1
+    INPUT_ROW       .req x2
+    OUTPUT_BUF      .req x3
+    NUM_ROWS        .req x4
+
+    INPUT_BUF0      .req x5
+    INPUT_BUF1      .req x6
+    INPUT_BUF2      .req INPUT_BUF
+
+    RGB             .req x7
+    Y               .req x8
+    U               .req x9
+    V               .req x10
+    N               .req x15
+
+    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
+    adr             x15, jsimd_ycc_\colorid\()_neon_consts
+    ld1             {v0.4h, v1.4h}, [x15], 16
+    ld1             {v2.8h}, [x15]
+
+    /* Save ARM registers and handle input arguments */
+    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
+    stp             x4, x5, [sp,-16]!
+    stp             x6, x7, [sp,-16]!
+    stp             x8, x9, [sp,-16]!
+    stp             x10, x30, [sp,-16]!
+    ldr             INPUT_BUF0, [INPUT_BUF]
+    ldr             INPUT_BUF1, [INPUT_BUF, 8]
+    ldr             INPUT_BUF2, [INPUT_BUF, 16]
+    .unreq          INPUT_BUF
+
+    /* Save NEON registers */
+    /* vpush           {v8.4h-v15.4h} */
+    sub             sp, sp, #32
+    st1             {v8.4h-v11.4h}, [sp]
+    sub             sp, sp, #32
+    st1             {v12.4h-v15.4h}, [sp]
+
+    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
+    movi            v10.16b, #255
+    movi            v12.16b, #255
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    blt             9f
+0:
+    lsl             x16, INPUT_ROW, #3
+    ldr             Y, [INPUT_BUF0, x16]
+    ldr             U, [INPUT_BUF1, x16]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [INPUT_BUF2, x16]
+    add             INPUT_ROW, INPUT_ROW, #1
+    ldr             RGB, [OUTPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    blt             3f
+    do_load         8
+    do_yuv_to_rgb_stage1
+    subs            N, N, #8
+    blt             2f
+1:
+    do_yuv_to_rgb_stage2_store_load_stage1
+    subs            N, N, #8
+    bge             1b
+2:
+    do_yuv_to_rgb_stage2
+    do_store        \bpp, 8
+    tst             N, #7
+    beq             8f
+3:
+    tst             N, #4
+    beq             3f
+    do_load         4
+3:
+    tst             N, #2
+    beq             4f
+    do_load         2
+4:
+    tst             N, #1
+    beq             5f
+    do_load         1
+5:
+    do_yuv_to_rgb
+    tst             N, #4
+    beq             6f
+    do_store        \bpp, 4
+6:
+    tst             N, #2
+    beq             7f
+    do_store        \bpp, 2
+7:
+    tst             N, #1
+    beq             8f
+    do_store        \bpp, 1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    bgt             0b
+9:
+    /* Restore all registers and return */
+    /* vpop            {v8.4h-v15.4h} */
+    ld1             {v12.4h-v15.4h}, [sp], #32
+    ld1             {v8.4h-v11.4h}, [sp], #32
+    /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
+    ldp             x10, x30, [sp], #16
+    ldp             x8, x9, [sp], #16
+    ldp             x6, x5, [sp], #16
+    ldp             x4, x5, [sp], #16
+    br              x30
+    .unreq          OUTPUT_WIDTH
+    .unreq          INPUT_ROW
+    .unreq          OUTPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          INPUT_BUF0
+    .unreq          INPUT_BUF1
+    .unreq          INPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+.endfunc
+
+.purgem do_yuv_to_rgb
+.purgem do_yuv_to_rgb_stage1
+.purgem do_yuv_to_rgb_stage2
+.purgem do_yuv_to_rgb_stage2_store_load_stage1
+.endm
+
+/* RTSM simulator fix integer saturation works on 8b boundry add a new parameter
+ * as a workaround for the simulator fix
+ */
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .8b
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
+#else
+/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  */
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h
+#endif
+
+.purgem do_load
+.purgem do_store
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index e96f5b8..b731edb 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -2,7 +2,7 @@
  * jsimd_i386.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011 D. R. Commander
+ * Copyright 2009-2011, 2013 D. R. Commander
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -59,6 +59,9 @@
   env = getenv("JSIMD_FORCESSE2");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support &= JSIMD_SSE2;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
 }
 
 GLOBAL(int)
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
new file mode 100644
index 0000000..5b4f893
--- /dev/null
+++ b/simd/jsimd_mips.c
@@ -0,0 +1,857 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011 D. R. Commander
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on
+ * MIPS architecture.
+ *
+ * Based on the stubs from 'jsimd_none.c'
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+LOCAL(int)
+parse_proc_cpuinfo(const char* search_string)
+{
+  const char* file_name = "/proc/cpuinfo";
+  char cpuinfo_line[256];
+  FILE* f = NULL;
+  simd_support = 0;
+
+  if ((f = fopen(file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
+        fclose(f);
+        simd_support |= JSIMD_MIPS_DSPR2;
+        return 1;
+      }
+    }
+    fclose(f);
+  }
+  /* Did not find string in the proc file, or not Linux ELF. */
+  return 0;
+}
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_MIPS_DSPR2;
+#elif defined(__linux__)
+  /* We still have a chance to use MIPS DSPR2 regardless of globally used
+   * -mdspr2 options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux */
+  if (!parse_proc_cpuinfo("MIPS 74K"))
+    return;
+#endif
+}
+static const int mips_idct_ifast_coefs[4] = {
+  0x45404540,           // FIX( 1.082392200 / 2) =  17734 = 0x4546
+  0x5A805A80,           // FIX( 1.414213562 / 2) =  23170 = 0x5A82
+  0x76407640,           // FIX( 1.847759065 / 2) =  30274 = 0x7642
+  0xAC60AC60            // FIX(-2.613125930 / 4) = -21407 = 0xAC61
+};
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
+
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
+      break;
+    default:
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    mipsdspr2fct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2;
+      break;
+    default:
+      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    mipsdspr2fct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
+      break;
+  default:
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    mipsdspr2fct(cinfo->output_width, input_buf,
+        input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width,
+        cinfo->max_v_samp_factor, compptr->v_samp_factor,
+        compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width,
+        cinfo->max_v_samp_factor, compptr->v_samp_factor,
+        compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_fdct_islow_mips_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_fdct_ifast_mips_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6 (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12 (void)
+{
+  init_simd();
+
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block,
+                              output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+  {
+    int workspace[DCTSIZE*4];  /* buffers data between passes */
+    jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block,
+                              output_buf, output_col, workspace);
+  }
+}
+
+GLOBAL(void)
+jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+           JCOEFPTR coef_block, JSAMPARRAY output_buf,
+           JDIMENSION output_col)
+{
+    if ((simd_support & JSIMD_MIPS_DSPR2))
+      jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block,
+                                output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block,
+                  JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2) {
+    int workspace[96];
+    int output[12] = {
+      (int)(output_buf[0] + output_col),
+      (int)(output_buf[1] + output_col),
+      (int)(output_buf[2] + output_col),
+      (int)(output_buf[3] + output_col),
+      (int)(output_buf[4] + output_col),
+      (int)(output_buf[5] + output_col),
+      (int)(output_buf[6] + output_col),
+      (int)(output_buf[7] + output_col),
+      (int)(output_buf[8] + output_col),
+      (int)(output_buf[9] + output_col),
+      (int)(output_buf[10] + output_col),
+      (int)(output_buf[11] + output_col),
+    };
+    jsimd_idct_12x12_pass1_mips_dspr2(coef_block,
+                                      compptr->dct_table, workspace);
+    jsimd_idct_12x12_pass2_mips_dspr2(workspace, output);
+  }
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2) {
+    JCOEFPTR inptr;
+    IFAST_MULT_TYPE * quantptr;
+    DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
+
+    /* Pass 1: process columns from input, store into work array. */
+
+    inptr = coef_block;
+    quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+
+    jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
+                                     workspace, mips_idct_ifast_coefs);
+
+    /* Pass 2: process rows from work array, store into output array. */
+    /* Note that we must descale the results by a factor of 8 == 2**3, */
+    /* and also undo the PASS1_BITS scaling. */
+
+    jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
+                                     output_col, mips_idct_ifast_coefs);
+  }
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
new file mode 100644
index 0000000..198c349
--- /dev/null
+++ b/simd/jsimd_mips_dspr2.S
@@ -0,0 +1,3324 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_mips_dspr2_asm.h"
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_mips_dspr2
+ * jsimd_extbgr_ycc_convert_mips_dspr2
+ * jsimd_extrgbx_ycc_convert_mips_dspr2
+ * jsimd_extbgrx_ycc_convert_mips_dspr2
+ * jsimd_extxbgr_ycc_convert_mips_dspr2
+ * jsimd_extxrgb_ycc_convert_mips_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC r,    \
+                     g,    \
+                     b,    \
+                     inptr
+    lbu     \r, \r_offs(\inptr)
+    lbu     \g, \g_offs(\inptr)
+    lbu     \b, \b_offs(\inptr)
+    addiu   \inptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - output_buf
+ * a3     - output_row
+ * 16(sp) - num_rows
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw      t7, 48(sp)        // t7 = num_rows
+    li      s0, 0x4c8b        // FIX(0.29900)
+    li      s1, 0x9646        // FIX(0.58700)
+    li      s2, 0x1d2f        // FIX(0.11400)
+    li      s3, 0xffffd4cd    // -FIX(0.16874)
+    li      s4, 0xffffab33    // -FIX(0.33126)
+    li      s5, 0x8000        // FIX(0.50000)
+    li      s6, 0xffff94d1    // -FIX(0.41869)
+    li      s7, 0xffffeb2f    // -FIX(0.08131)
+    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
+
+0:
+    addiu   t7, -1            // --num_rows
+    lw      t6, 0(a1)         // t6 = input_buf[0]
+    lw      t0, 0(a2)
+    lw      t1, 4(a2)
+    lw      t2, 8(a2)
+    sll     t3, a3, 2
+    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
+    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
+    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
+
+    addu    t9, t2, a0        // t9 = end address
+    addiu   a3, 1
+
+1:
+    DO_RGB_TO_YCC t3, t4, t5, t6
+
+    mtlo    s5, $ac0
+    mtlo    t8, $ac1
+    mtlo    t8, $ac2
+    maddu   $ac0, s2, t5
+    maddu   $ac1, s5, t5
+    maddu   $ac2, s5, t3
+    maddu   $ac0, s0, t3
+    maddu   $ac1, s3, t3
+    maddu   $ac2, s6, t4
+    maddu   $ac0, s1, t4
+    maddu   $ac1, s4, t4
+    maddu   $ac2, s7, t5
+    extr.w  t3, $ac0, 16
+    extr.w  t4, $ac1, 16
+    extr.w  t5, $ac2, 16
+    sb      t3, 0(t0)
+    sb      t4, 0(t1)
+    sb      t5, 0(t2)
+    addiu   t0, 1
+    addiu   t2, 1
+    bne     t2, t9, 1b
+     addiu  t1, 1
+    bgtz    t7, 0b
+     addiu  a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_mips_dspr2
+ * jsimd_ycc_extbgr_convert_mips_dspr2
+ * jsimd_ycc_extrgbx_convert_mips_dspr2
+ * jsimd_ycc_extbgrx_convert_mips_dspr2
+ * jsimd_ycc_extxbgr_convert_mips_dspr2
+ * jsimd_ycc_extxrgb_convert_mips_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB  scratch0 \
+                         scratch1 \
+                         scratch2 \
+                         outptr
+    sb       \scratch0, \r_offs(\outptr)
+    sb       \scratch1, \g_offs(\outptr)
+    sb       \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+    li       t0, 0xFF
+    sb       t0, \a_offs(\outptr)
+.endif
+    addiu    \outptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - input_row
+ * a3     - output_buf
+ * 16(sp) - num_rows
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw         s1, 48(sp)
+    li         t3, 0x8000
+    li         t4, 0x166e9     // FIX(1.40200)
+    li         t5, 0x1c5a2     // FIX(1.77200)
+    li         t6, 0xffff492e  // -FIX(0.71414)
+    li         t7, 0xffffa7e6  // -FIX(0.34414)
+    repl.ph    t8, 128
+
+0:
+    lw         s0, 0(a3)
+    lw         t0, 0(a1)
+    lw         t1, 4(a1)
+    lw         t2, 8(a1)
+    sll        s5, a2, 2
+    addiu      s1, -1
+    lwx        s2, s5(t0)
+    lwx        s3, s5(t1)
+    lwx        s4, s5(t2)
+    addu       t9, s2, a0
+    addiu      a2, 1
+
+1:
+    lbu        s7, 0(s4)       // cr
+    lbu        s6, 0(s3)       // cb
+    lbu        s5, 0(s2)       // y
+    addiu      s2, 1
+    addiu      s4, 1
+    addiu      s7, -128
+    addiu      s6, -128
+    mul        t2, t7, s6
+    mul        t0, t6, s7      // Crgtab[cr]
+    sll        s7, 15
+    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
+    sll        s6, 15
+    addu       t2, t3          // Cbgtab[cb]
+    addu       t2, t0
+
+    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
+    sra        t2, 16
+    addu       t1, s5
+    addu       t2, s5          // add y
+    ins        t2, t1, 16, 16
+    subu.ph    t2, t2, t8
+    addu       t0, s5
+    shll_s.ph  t2, t2, 8
+    subu       t0, 128
+    shra.ph    t2, t2, 8
+    shll_s.w   t0, t0, 24
+    addu.ph    t2, t2, t8      // clip & store
+    sra        t0, t0, 24
+    sra        t1, t2, 16
+    addiu      t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne        s2, t9, 1b
+     addiu     s3, 1
+    bgtz       s1, 0b
+     addiu     a3, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_mips_dspr2
+ * jsimd_extbgr_gray_convert_mips_dspr2
+ * jsimd_extrgbx_gray_convert_mips_dspr2
+ * jsimd_extbgrx_gray_convert_mips_dspr2
+ * jsimd_extxbgr_gray_convert_mips_dspr2
+ * jsimd_extxrgb_gray_convert_mips_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY r,    \
+                      g,    \
+                      b,    \
+                      inptr
+    lbu     \r, \r_offs(\inptr)
+    lbu     \g, \g_offs(\inptr)
+    lbu     \b, \b_offs(\inptr)
+    addiu   \inptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - output_buf
+ * a3     - output_row
+ * 16(sp) - num_rows
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    li      s0, 0x4c8b             // s0 = FIX(0.29900)
+    li      s1, 0x9646             // s1 = FIX(0.58700)
+    li      s2, 0x1d2f             // s2 = FIX(0.11400)
+    li      s7, 0x8000             // s7 = FIX(0.50000)
+    lw      s6, 48(sp)
+    andi    t7, a0, 3
+
+0:
+    addiu   s6, -1                 // s6 = num_rows
+    lw      t0, 0(a1)
+    lw      t1, 0(a2)
+    sll     t3, a3, 2
+    lwx     t1, t3(t1)
+    addiu   a3, 1
+    addu    t9, t1, a0
+    subu    t8, t9, t7
+    beq     t1, t8, 2f
+     nop
+
+1:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo    s7, $ac0
+    maddu   $ac0, s2, t5
+    maddu   $ac0, s1, t4
+    maddu   $ac0, s0, t3
+    mtlo    s7, $ac1
+    maddu   $ac1, s2, s5
+    maddu   $ac1, s1, s4
+    maddu   $ac1, s0, s3
+    extr.w  t6, $ac0, 16
+
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo    s7, $ac0
+    maddu   $ac0, s2, t5
+    maddu   $ac0, s1, t4
+    extr.w  t2, $ac1, 16
+    maddu   $ac0, s0, t3
+    mtlo    s7, $ac1
+    maddu   $ac1, s2, s5
+    maddu   $ac1, s1, s4
+    maddu   $ac1, s0, s3
+    extr.w  t5, $ac0, 16
+    sb      t6, 0(t1)
+    sb      t2, 1(t1)
+    extr.w  t3, $ac1, 16
+    addiu   t1, 4
+    sb      t5, -2(t1)
+    sb      t3, -1(t1)
+    bne     t1, t8, 1b
+     nop
+
+2:
+    beqz    t7, 4f
+     nop
+
+3:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+
+    mtlo    s7, $ac0
+    maddu   $ac0, s2, t5
+    maddu   $ac0, s1, t4
+    maddu   $ac0, s0, t3
+    extr.w  t6, $ac0, 16
+    sb      t6, 0(t1)
+    addiu   t1, 1
+    bne     t1, t9, 3b
+     nop
+
+4:
+    bgtz    s6, 0b
+     addiu  a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_\colorid\()_gray_convert_mips_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*------------------------------------------id --  pix R  G  B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_mips_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - downsampled_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    li             s4, 0
+    lw             s2, 0(a3)       // s2 = *output_data_ptr
+0:
+    li             t9, 2
+    lw             s1, -4(a2)      // s1 = inptr1
+
+1:
+    lw             s0, 0(a2)       // s0 = inptr0
+    lwx            s3, s4(s2)
+    addiu          s5, a1, -2      // s5 = downsampled_width - 2
+    srl            t4, s5, 1
+    sll            t4, t4, 1
+    lbu            t0, 0(s0)
+    lbu            t1, 1(s0)
+    lbu            t2, 0(s1)
+    lbu            t3, 1(s1)
+    addiu          s0, 2
+    addiu          s1, 2
+    addu           t8, s0, t4      // t8 = end address
+    andi           s5, s5, 1       // s5 = residual
+    sll            t4, t0, 1
+    sll            t6, t1, 1
+    addu           t0, t0, t4      // t0 = (*inptr0++) * 3
+    addu           t1, t1, t6      // t1 = (*inptr0++) * 3
+    addu           t7, t0, t2      // t7 = thiscolsum
+    addu           t6, t1, t3      // t5 = nextcolsum
+    sll            t0, t7, 2       // t0 = thiscolsum * 4
+    subu           t1, t0, t7      // t1 = thiscolsum * 3
+    shra_r.w       t0, t0, 4
+    addiu          t1, 7
+    addu           t1, t1, t6
+    srl            t1, t1, 4
+    sb             t0, 0(s3)
+    sb             t1, 1(s3)
+    addiu          s3, 2
+2:
+    lh             t0, 0(s0)       // t0 = A3|A2
+    lh             t2, 0(s1)       // t2 = B3|B2
+    addiu          s0, 2
+    addiu          s1, 2
+    preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
+    preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
+    shll.ph        t1, t0, 1
+    sll            t3, t6, 1
+    addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
+    addu           t3, t3, t6      // t3 = this * 3
+    addu.ph        t0, t0, t2      // t0 = next2|next1
+    addu           t1, t3, t7
+    andi           t7, t0, 0xFFFF  // t7 = next1
+    sll            t2, t7, 1
+    addu           t2, t7, t2      // t2 = next1*3
+    addu           t4, t2, t6
+    srl            t6, t0, 16      // t6 = next2
+    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
+    addu           t0, t3, t7
+    addiu          t0, 7
+    srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
+    shra_r.w       t4, t4, 4       // t3 = (next1*3 + this + 8) >> 4
+    addu           t2, t2, t6
+    addiu          t2, 7
+    srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    sb             t4, 2(s3)
+    sb             t2, 3(s3)
+    bne            t8, s0, 2b
+     addiu         s3, 4
+    beqz           s5, 4f
+     addu          t8, s0, s5
+3:
+    lbu            t0, 0(s0)
+    lbu            t2, 0(s1)
+    addiu          s0, 1
+    addiu          s1, 1
+    sll            t3, t6, 1
+    sll            t1, t0, 1
+    addu           t1, t0, t1      // t1 = inptr0 * 3
+    addu           t3, t3, t6      // t3 = thiscolsum * 3
+    addu           t5, t1, t2
+    addu           t1, t3, t7
+    shra_r.w       t1, t1, 4
+    addu           t0, t3, t5
+    addiu          t0, 7
+    srl            t0, t0, 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    addiu          s3, 2
+    move           t7, t6
+    bne            t8, s0, 3b
+     move          t6, t5
+4:
+    sll            t0, t6, 2       // t0 = thiscolsum * 4
+    subu           t1, t0, t6      // t1 = thiscolsum * 3
+    addu           t1, t1, t7
+    addiu          s4, 4
+    shra_r.w       t1, t1, 4
+    addiu          t0, 7
+    srl            t0, t0, 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    addiu          t9, -1
+    addiu          s3, 2
+    bnez           t9, 1b
+     lw            s1, 4(a2)
+    srl            t0, s4, 2
+    subu           t0, a0, t0
+    bgtz           t0, 0b
+     addiu         a2, 4
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j ra
+     nop
+END(jsimd_h2v2_fancy_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - downsampled_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    .set at
+
+    beqz           a0, 3f
+     sll           t0, a0, 2
+    lw             s1, 0(a3)
+    addu           s0, s1, t0
+    li             s3, 0x10001
+0:
+    addiu          t8, a1, -2
+    srl            t9, t8, 2
+    lw             t7, 0(a2)
+    lw             s2, 0(s1)
+    lbu            t0, 0(t7)
+    lbu            t1, 1(t7)   // t1 = inptr[1]
+    sll            t2, t0, 1
+    addu           t2, t2, t0  // t2 = invalue*3
+    addu           t2, t2, t1
+    shra_r.w       t2, t2, 2
+    sb             t0, 0(s2)
+    sb             t2, 1(s2)
+    beqz           t9, 11f
+     addiu         s2, 2
+1:
+    ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
+    ulw            t1, 1(t7)
+    ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
+    preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
+    preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
+    preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
+    preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
+    preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
+    shll.ph        t5, t4, 1
+    shll.ph        t6, t1, 1
+    addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
+    addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
+    addu.ph        t4, t3, s3
+    addu.ph        t0, t0, s3
+    addu.ph        t4, t4, t5
+    addu.ph        t0, t0, t6
+    shrl.ph        t4, t4, 2   // t4 = |0|P3|0|P2|
+    shrl.ph        t0, t0, 2   // t0 = |0|P1|0|P0|
+    addu.ph        t2, t2, t5
+    addu.ph        t3, t3, t6
+    shra_r.ph      t2, t2, 2   // t2 = |0|P5|0|P4|
+    shra_r.ph      t3, t3, 2   // t3 = |0|P3|0|P2|
+    shll.ph        t2, t2, 8
+    shll.ph        t3, t3, 8
+    or             t2, t4, t2
+    or             t3, t3, t0
+    addiu          t9, -1
+    usw            t3, 0(s2)
+    usw            t2, 4(s2)
+    addiu          s2, 8
+    bgtz           t9, 1b
+     addiu         t7, 4
+11:
+    andi           t8, 3
+    beqz           t8, 3f
+     addiu         t7, 1
+2:
+    lbu            t0, 0(t7)
+    addiu          t7, 1
+    sll            t1, t0, 1
+    addu           t2, t0, t1  // t2 = invalue
+    lbu            t3, -2(t7)
+    lbu            t4, 0(t7)
+    addiu          t3, 1
+    addiu          t4, 2
+    addu           t3, t3, t2
+    addu           t4, t4, t2
+    srl            t3, 2
+    srl            t4, 2
+    sb             t3, 0(s2)
+    sb             t4, 1(s2)
+    addiu          t8, -1
+    bgtz           t8, 2b
+     addiu         s2, 2
+
+    lbu            t0, 0(t7)
+    lbu            t2, -1(t7)
+    sll            t1, t0, 1
+    addu           t1, t1, t0 // t1 = invalue * 3
+    addu           t1, t1, t2
+    addiu          t1, 1
+    srl            t1, t1, 2
+    sb             t1, 0(s2)
+    sb             t0, 1(s2)
+    addiu          s1, 4
+    bne            s1, s0, 0b
+     addiu         a2, 4
+3:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j              ra
+     nop
+END(jsimd_h2v1_fancy_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - cinfo->max_v_samp_factor
+ * a2     - compptr->v_samp_factor
+ * a3     - compptr->width_in_blocks
+ * 16(sp) - input_data
+ * 20(sp) - output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+    beqz        a2, 7f
+     lw         s1, 44(sp)  // s1 = output_data
+    lw          s0, 40(sp)  // s0 = input_data
+    srl         s2, a0, 2
+    andi        t9, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3   // t0 = width_in_blocks*DCT
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    andi        t6, a0, 1   // t6 = temp_index
+    addiu       t6, -1
+    lw          t4, 0(s1)   // t4 = outptr
+    lw          t5, 0(s0)   // t5 = inptr0
+    li          s3, 0       // s3 = bias
+    srl         t7, a0, 1   // t7 = image_width1
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+1:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 2(t5)
+    ulhu        t2, 4(t5)
+    ulhu        t3, 6(t5)
+    raddu.w.qb  t0, t0
+    raddu.w.qb  t1, t1
+    raddu.w.qb  t2, t2
+    raddu.w.qb  t3, t3
+    shra.ph     t0, t0, 1
+    shra_r.ph   t1, t1, 1
+    shra.ph     t2, t2, 1
+    shra_r.ph   t3, t3, 1
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t3, 3(t4)
+    addiu       s4, -1
+    addiu       t4, 4
+    bgtz        s4, 1b
+     addiu      t5, 8
+    beqz        t8, 3f
+     addu       s4, t4, t8
+2:
+    ulhu        t0, 0(t5)
+    raddu.w.qb  t0, t0
+    addqh.w     t0, t0, s3
+    xori        s3, s3, 1
+    sb          t0, 0(t4)
+    addiu       t4, 1
+    bne         t4, s4, 2b
+     addiu      t5, 2
+3:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    addqh.w     t2, t1, s3  // t2 = pixval1
+    xori        s3, s3, 1
+    addqh.w     t3, t1, s3  // t3 = pixval2
+    blez        s2, 5f
+     append     t3, t2,  8
+    addu        t5, t4, s2  // t5 = loop_end2
+4:
+    ush         t3, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 4b
+     addiu      t4,  2
+5:
+    beqz        t9, 6f
+     nop
+    sb          t2, 0(t4)
+6:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 4
+7:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+    j           ra
+    nop
+END(jsimd_h2v1_downsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
+
+/*
+ * a0     - cinfo->image_width
+ * a1     - cinfo->max_v_samp_factor
+ * a2     - compptr->v_samp_factor
+ * a3     - compptr->width_in_blocks
+ * 16(sp) - input_data
+ * 20(sp) - output_data
+ */
+    .set at
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    beqz         a2, 8f
+     lw          s1, 52(sp)      // s1 = output_data
+    lw           s0, 48(sp)      // s0 = input_data
+
+    andi         t6, a0, 1       // t6 = temp_index
+    addiu        t6, -1
+    srl          t7, a0, 1       // t7 = image_width1
+    srl          s4, t7, 2
+    andi         t8, t7, 3
+    andi         t9, a0, 2
+    srl          s2, a0, 2
+    srl          t7, t9, 1
+    addu         s2, t7, s2
+    sll          t0, a3, 3       // s2 = width_in_blocks*DCT
+    srl          t7, t0, 1
+    subu         s2, t7, s2
+0:
+    lw           t4, 0(s1)       // t4 = outptr
+    lw           t5, 0(s0)       // t5 = inptr0
+    lw           s7, 4(s0)       // s7 = inptr1
+    li           s6, 1           // s6 = bias
+2:
+    ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
+    ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
+    ulw          t2, 4(t5)
+    ulw          t3, 4(s7)
+    precrq.ph.w  t7, t0, t1      // t2 = |P3|P2|Q3|Q2|
+    ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
+    raddu.w.qb   t1, t7
+    raddu.w.qb   t0, t0
+    shra_r.w     t1, t1, 2
+    addiu        t0, 1
+    srl          t0, 2
+    precrq.ph.w  t7, t2, t3
+    ins          t2, t3, 16, 16
+    raddu.w.qb   t7, t7
+    raddu.w.qb   t2, t2
+    shra_r.w     t7, t7, 2
+    addiu        t2, 1
+    srl          t2, 2
+    sb           t0, 0(t4)
+    sb           t1, 1(t4)
+    sb           t2, 2(t4)
+    sb           t7, 3(t4)
+    addiu        t4, 4
+    addiu        t5, 8
+    addiu        s4, s4, -1
+    bgtz         s4, 2b
+     addiu       s7, 8
+    beqz         t8, 4f
+     addu        t8, t4, t8
+3:
+    ulhu         t0, 0(t5)
+    ulhu         t1, 0(s7)
+    ins          t0, t1, 16, 16
+    raddu.w.qb   t0, t0
+    addu         t0, t0, s6
+    srl          t0, 2
+    xori         s6, s6, 3
+    sb           t0, 0(t4)
+    addiu        t5, 2
+    addiu        t4, 1
+    bne          t8, t4, 3b
+     addiu       s7, 2
+4:
+    lbux         t1, t6(t5)
+    sll          t1, 1
+    lbux         t0, t6(s7)
+    sll          t0, 1
+    addu         t1, t1, t0
+    addu         t3, t1, s6
+    srl          t0, t3, 2       // t2 = pixval1
+    xori         s6, s6, 3
+    addu         t2, t1, s6
+    srl          t1, t2, 2       // t3 = pixval2
+    blez         s2, 6f
+     append      t1, t0, 8
+5:
+    ush          t1, 0(t4)
+    addiu        s2, -1
+    bgtz         s2, 5b
+     addiu       t4, 2
+6:
+    beqz         t9, 7f
+     nop
+    sb           t0, 0(t4)
+7:
+    addiu        s1, 4
+    addiu        a2, -1
+    bnez         a2, 0b
+     addiu       s0, 8
+8:
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j            ra
+     nop
+END(jsimd_h2v2_downsample_mips_dspr2)
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - cinfo->output_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+    lw      t7, 0(a3)       // t7 = output_data
+    andi    t8, a1, 0xf     // t8 = residual
+    sll     t0, a0, 2
+    blez    a0, 4f
+     addu   t9, t7, t0      // t9 = output_data end address
+0:
+    lw      t5, 0(t7)       // t5 = outptr
+    lw      t6, 0(a2)       // t6 = inptr
+    addu    t3, t5, a1      // t3 = outptr + output_width (end address)
+    subu    t3, t8          // t3 = end address - residual
+    beq     t5, t3, 2f
+     move   t4, t8
+1:
+    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
+    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
+    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
+    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
+    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
+    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
+    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
+    usw     t0, 0(t5)
+    usw     t1, 4(t5)
+    srl     t0, t2, 16      // t0 = |X|X|P7|P6|
+    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
+    ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
+    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
+    ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
+    usw     t2, 8(t5)
+    usw     t0, 12(t5)
+    addiu   t5, 16
+    bne     t5, t3, 1b
+     addiu  t6, 8
+    beqz    t8, 3f
+     move   t4, t8
+2:
+    lbu     t1, 0(t6)
+    sb      t1, 0(t5)
+    sb      t1, 1(t5)
+    addiu   t4, -2
+    addiu   t6, 1
+    bgtz    t4, 2b
+     addiu  t5, 2
+3:
+    addiu   t7, 4
+    bne     t9, t7, 0b
+     addiu  a2, 4
+4:
+    j       ra
+     nop
+END(jsimd_h2v1_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
+/*
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - cinfo->output_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+    lw      t7, 0(a3)
+    blez    a0, 7f
+     andi   t9, a1, 0xf     // t9 = residual
+0:
+    lw      t6, 0(a2)       // t6 = inptr
+    lw      t5, 0(t7)       // t5 = outptr
+    addu    t8, t5, a1      // t8 = outptr end address
+    subu    t8, t9          // t8 = end address - residual
+    beq     t5, t8, 2f
+     move   t4, t9
+1:
+    ulw     t0, 0(t6)
+    srl     t1, t0, 16
+    ins     t0, t0, 16, 16
+    ins     t0, t0, 8, 16
+    ins     t1, t1, 16, 16
+    ins     t1, t1, 8, 16
+    ulw     t2, 4(t6)
+    usw     t0, 0(t5)
+    usw     t1, 4(t5)
+    srl     t3, t2, 16
+    ins     t2, t2, 16, 16
+    ins     t2, t2, 8, 16
+    ins     t3, t3, 16, 16
+    ins     t3, t3, 8, 16
+    usw     t2, 8(t5)
+    usw     t3, 12(t5)
+    addiu   t5, 16
+    bne     t5, t8, 1b
+     addiu  t6, 8
+    beqz    t9, 3f
+     move   t4, t9
+2:
+    lbu     t0, 0(t6)
+    sb      t0, 0(t5)
+    sb      t0, 1(t5)
+    addiu   t4, -2
+    addiu   t6, 1
+    bgtz    t4, 2b
+     addiu  t5, 2
+3:
+    ulw     t6, 0(t7)       // t6 = outptr
+    ulw     t5, 4(t7)       // t5 = outptr[1]
+    addu    t4, t6, a1      // t4 = new end address
+    subu    t8, t4, t9
+    beqz    t8, 5f
+     nop
+4:
+    ulw     t0, 0(t6)
+    ulw     t1, 4(t6)
+    ulw     t2, 8(t6)
+    usw     t0, 0(t5)
+    ulw     t0, 12(t6)
+    usw     t1, 4(t5)
+    usw     t2, 8(t5)
+    usw     t0, 12(t5)
+    addiu   t6, 16
+    bne     t6, t8, 4b
+     addiu  t5, 16
+    beqz    t9, 6f
+     nop
+5:
+    lbu     t0, 0(t6)
+    sb      t0, 0(t5)
+    addiu   t6, 1
+    bne     t6, t4, 5b
+     addiu  t5, 1
+6:
+    addiu   t7, 8
+    addiu   a0, -2
+    bgtz    a0, 0b
+     addiu  a2, 4
+7:
+    j       ra
+     nop
+END(jsimd_h2v2_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
+/*
+ * a0     - inptr
+ * a1     - quantptr
+ * a2     - wsptr
+ * a3     - mips_idct_ifast_coefs
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu          t9, a0, 16            // end address
+    or             AT, a3, zero
+
+0:
+    lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
+    lw             t0, 0(a0)             // inptr[DCTSIZE*0]
+    lw             t1, 16(a0)            // inptr[DCTSIZE*1]
+    muleq_s.w.phl  v0, t0, s0            // tmp0 ...
+    lw             t2, 32(a0)            // inptr[DCTSIZE*2]
+    lw             t3, 48(a0)            // inptr[DCTSIZE*3]
+    lw             t4, 64(a0)            // inptr[DCTSIZE*4]
+    lw             t5, 80(a0)            // inptr[DCTSIZE*5]
+    muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
+    lw             t6, 96(a0)            // inptr[DCTSIZE*6]
+    lw             t7, 112(a0)           // inptr[DCTSIZE*7]
+    or             s4, t1, t2
+    or             s5, t3, t4
+    bnez           s4, 1f
+     ins           t0, v0, 16, 16        // ... tmp0
+    bnez           s5, 1f
+     or            s6, t5, t6
+    or             s6, s6, t7
+    bnez           s6, 1f
+     sw            t0, 0(a2)             // wsptr[DCTSIZE*0]
+    sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
+    sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
+    sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
+    sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
+    sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
+    sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
+    sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
+    addiu          a0, a0, 4
+    b              2f
+     addiu         a1, a1, 4
+
+1:
+    lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
+    lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
+    muleq_s.w.phl  v0, t2, s1            // tmp1 ...
+    muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
+    lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
+    lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
+    lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
+    muleq_s.w.phl  v1, t4, s2            // tmp2 ...
+    muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
+    lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
+    lw             t8, 4(AT)             // FIX(1.414213562)
+    ins            t2, v0, 16, 16        // ... tmp1
+    muleq_s.w.phl  v0, t6, s3            // tmp3 ...
+    muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
+    ins            t4, v1, 16, 16        // ... tmp2
+    addq.ph        s4, t0, t4            // tmp10
+    subq.ph        s5, t0, t4            // tmp11
+    ins            t6, v0, 16, 16        // ... tmp3
+    subq.ph        s6, t2, t6            // tmp12 ...
+    addq.ph        s7, t2, t6            // tmp13
+    mulq_s.ph      s6, s6, t8            // ... tmp12 ...
+    addq.ph        t0, s4, s7            // tmp0
+    subq.ph        t6, s4, s7            // tmp3
+    muleq_s.w.phl  v0, t1, s0            // tmp4 ...
+    muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
+    shll_s.ph      s6, s6, 1             // x2
+    lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
+    subq.ph        s6, s6, s7            // ... tmp12
+    muleq_s.w.phl  v1, t7, s3            // tmp7 ...
+    muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
+    ins            t1, v0, 16, 16        // ... tmp4
+    addq.ph        t2, s5, s6            // tmp1
+    subq.ph        t4, s5, s6            // tmp2
+    muleq_s.w.phl  v0, t5, s2            // tmp6 ...
+    muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
+    ins            t7, v1, 16, 16        // ... tmp7
+    addq.ph        s5, t1, t7            // z11
+    subq.ph        s6, t1, t7            // z12
+    muleq_s.w.phl  v1, t3, s1            // tmp5 ...
+    muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
+    ins            t5, v0, 16, 16        // ... tmp6
+    ins            t3, v1, 16, 16        // ... tmp5
+    addq.ph        s7, t5, t3            // z13
+    subq.ph        v0, t5, t3            // z10
+    addq.ph        t7, s5, s7            // tmp7
+    subq.ph        s5, s5, s7            // tmp11 ...
+    addq.ph        v1, v0, s6            // z5 ...
+    mulq_s.ph      s5, s5, t8            // ... tmp11
+    lw             t8, 8(AT)             // FIX(1.847759065)
+    lw             s4, 0(AT)             // FIX(1.082392200)
+    addq.ph        s0, t0, t7
+    subq.ph        s1, t0, t7
+    mulq_s.ph      v1, v1, t8            // ... z5
+    shll_s.ph      s5, s5, 1             // x2
+    lw             t8, 12(AT)            // FIX(-2.613125930)
+    sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
+    shll_s.ph      v0, v0, 1             // x4
+    mulq_s.ph      v0, v0, t8            // tmp12 ...
+    mulq_s.ph      s4, s6, s4            // tmp10 ...
+    shll_s.ph      v1, v1, 1             // x2
+    addiu          a0, a0, 4
+    addiu          a1, a1, 4
+    sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
+    shll_s.ph      s6, v0, 1             // x4
+    shll_s.ph      s4, s4, 1             // x2
+    addq.ph        s6, s6, v1            // ... tmp12
+    subq.ph        t5, s6, t7            // tmp6
+    subq.ph        s4, s4, v1            // ... tmp10
+    subq.ph        t3, s5, t5            // tmp5
+    addq.ph        s2, t2, t5
+    addq.ph        t1, s4, t3            // tmp4
+    subq.ph        s3, t2, t5
+    sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
+    sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
+    addq.ph        v0, t4, t3
+    subq.ph        v1, t4, t3
+    sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
+    sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
+    addq.ph        v0, t6, t1
+    subq.ph        v1, t6, t1
+    sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
+    sw             v1, 48(a2)            // wsptr[DCTSIZE*3]
+
+2:
+    bne            a0, t9, 0b
+     addiu         a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j              ra
+     nop
+
+END(jsimd_idct_ifast_cols_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
+/*
+ * a0     - wsptr
+ * a1     - output_buf
+ * a2     - output_col
+ * a3     - mips_idct_ifast_coefs
+ */
+
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    addiu         t9, a0, 128        // end address
+    lui           s8, 0x8080
+    ori           s8, s8, 0x8080
+
+0:
+    lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
+    lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
+    lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
+    lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
+    lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
+    lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
+    lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
+    lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
+    lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
+    precrq.ph.w   t1, s0, t0         // B b
+    ins           t0, s0, 16, 16     // A a
+    bnez          t1, 1f
+     or           s0, t2, s2
+    bnez          s0, 1f
+     or           s0, t4, s4
+    bnez          s0, 1f
+     or           s0, t6, s6
+    bnez          s0, 1f
+     shll_s.ph    s0, t0, 2          // A a
+    lw            a3, 0(a1)
+    lw            AT, 4(a1)
+    precrq.ph.w   t0, s0, s0         // A A
+    ins           s0, s0, 16, 16     // a a
+    addu          a3, a3, a2
+    addu          AT, AT, a2
+    precrq.qb.ph  t0, t0, t0         // A A A A
+    precrq.qb.ph  s0, s0, s0         // a a a a
+    addu.qb       s0, s0, s8
+    addu.qb       t0, t0, s8
+    sw            s0, 0(a3)
+    sw            s0, 4(a3)
+    sw            t0, 0(AT)
+    sw            t0, 4(AT)
+    addiu         a0, a0, 32
+    bne           a0, t9, 0b
+     addiu        a1, a1, 8
+    b             2f
+     nop
+
+1:
+    precrq.ph.w   t3, s2, t2
+    ins           t2, s2, 16, 16
+    precrq.ph.w   t5, s4, t4
+    ins           t4, s4, 16, 16
+    precrq.ph.w   t7, s6, t6
+    ins           t6, s6, 16, 16
+    lw            t8, 4(AT)          // FIX(1.414213562)
+    addq.ph       s4, t0, t4         // tmp10
+    subq.ph       s5, t0, t4         // tmp11
+    subq.ph       s6, t2, t6         // tmp12 ...
+    addq.ph       s7, t2, t6         // tmp13
+    mulq_s.ph     s6, s6, t8         // ... tmp12 ...
+    addq.ph       t0, s4, s7         // tmp0
+    subq.ph       t6, s4, s7         // tmp3
+    shll_s.ph     s6, s6, 1          // x2
+    subq.ph       s6, s6, s7         // ... tmp12
+    addq.ph       t2, s5, s6         // tmp1
+    subq.ph       t4, s5, s6         // tmp2
+    addq.ph       s5, t1, t7         // z11
+    subq.ph       s6, t1, t7         // z12
+    addq.ph       s7, t5, t3         // z13
+    subq.ph       v0, t5, t3         // z10
+    addq.ph       t7, s5, s7         // tmp7
+    subq.ph       s5, s5, s7         // tmp11 ...
+    addq.ph       v1, v0, s6         // z5 ...
+    mulq_s.ph     s5, s5, t8         // ... tmp11
+    lw            t8, 8(AT)          // FIX(1.847759065)
+    lw            s4, 0(AT)          // FIX(1.082392200)
+    addq.ph       s0, t0, t7         // tmp0 + tmp7
+    subq.ph       s7, t0, t7         // tmp0 - tmp7
+    mulq_s.ph     v1, v1, t8         // ... z5
+    lw            a3, 0(a1)
+    lw            t8, 12(AT)         // FIX(-2.613125930)
+    shll_s.ph     s5, s5, 1          // x2
+    addu          a3, a3, a2
+    shll_s.ph     v0, v0, 1          // x4
+    mulq_s.ph     v0, v0, t8         // tmp12 ...
+    mulq_s.ph     s4, s6, s4         // tmp10 ...
+    shll_s.ph     v1, v1, 1          // x2
+    addiu         a0, a0, 32
+    addiu         a1, a1, 8
+    shll_s.ph     s6, v0, 1          // x4
+    shll_s.ph     s4, s4, 1          // x2
+    addq.ph       s6, s6, v1         // ... tmp12
+    shll_s.ph     s0, s0, 2
+    subq.ph       t5, s6, t7         // tmp6
+    subq.ph       s4, s4, v1         // ... tmp10
+    subq.ph       t3, s5, t5         // tmp5
+    shll_s.ph     s7, s7, 2
+    addq.ph       t1, s4, t3         // tmp4
+    addq.ph       s1, t2, t5         // tmp1 + tmp6
+    subq.ph       s6, t2, t5         // tmp1 - tmp6
+    addq.ph       s2, t4, t3         // tmp2 + tmp5
+    subq.ph       s5, t4, t3         // tmp2 - tmp5
+    addq.ph       s4, t6, t1         // tmp3 + tmp4
+    subq.ph       s3, t6, t1         // tmp3 - tmp4
+    shll_s.ph     s1, s1, 2
+    shll_s.ph     s2, s2, 2
+    shll_s.ph     s3, s3, 2
+    shll_s.ph     s4, s4, 2
+    shll_s.ph     s5, s5, 2
+    shll_s.ph     s6, s6, 2
+    precrq.ph.w   t0, s1, s0         // B A
+    ins           s0, s1, 16, 16     // b a
+    precrq.ph.w   t2, s3, s2         // D C
+    ins           s2, s3, 16, 16     // d c
+    precrq.ph.w   t4, s5, s4         // F E
+    ins           s4, s5, 16, 16     // f e
+    precrq.ph.w   t6, s7, s6         // H G
+    ins           s6, s7, 16, 16     // h g
+    precrq.qb.ph  t0, t2, t0         // D C B A
+    precrq.qb.ph  s0, s2, s0         // d c b a
+    precrq.qb.ph  t4, t6, t4         // H G F E
+    precrq.qb.ph  s4, s6, s4         // h g f e
+    addu.qb       s0, s0, s8
+    addu.qb       s4, s4, s8
+    sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
+    sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
+    lw            a3, -4(a1)
+    addu.qb       t0, t0, s8
+    addu          a3, a3, a2
+    addu.qb       t4, t4, s8
+    sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
+    bne           a0, t9, 0b
+     sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E
+
+2:
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_rows_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
+/*
+ * a0     - data
+ */
+
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lui       t0, 6437
+    ori       t0, 2260
+    lui       t1, 9633
+    ori       t1, 11363
+    lui       t2, 0xd39e
+    ori       t2, 0xe6dc
+    lui       t3, 0xf72d
+    ori       t3, 9633
+    lui       t4, 2261
+    ori       t4, 9633
+    lui       t5, 0xd39e
+    ori       t5, 6437
+    lui       t6, 9633
+    ori       t6, 0xd39d
+    lui       t7, 0xe6dc
+    ori       t7, 2260
+    lui       t8, 4433
+    ori       t8, 10703
+    lui       t9, 0xd630
+    ori       t9, 4433
+    li        s8, 8
+    move      a1, a0
+1:
+    lw        s0, 0(a1)     // tmp0 = 1|0
+    lw        s1, 4(a1)     // tmp1 = 3|2
+    lw        s2, 8(a1)     // tmp2 = 5|4
+    lw        s3, 12(a1)    // tmp3 = 7|6
+    packrl.ph s1, s1, s1    // tmp1 = 2|3
+    packrl.ph s3, s3, s3    // tmp3 = 6|7
+    subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
+    subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
+    mult      $0, $0        // ac0  = 0
+    dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
+    dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
+    mult      $ac1, $0, $0  // ac1  = 0
+    dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
+    dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
+    mult      $ac2, $0, $0  // ac2  = 0
+    dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
+    dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
+    mult      $ac3, $0, $0  // ac3  = 0
+    dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
+    dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
+    addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
+    addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
+    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
+    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
+    extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
+    extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
+    addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
+    subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
+    sh        s0, 2(a1)
+    sh        s1, 6(a1)
+    sh        s2, 10(a1)
+    sh        s3, 14(a1)
+    mult      $0, $0        // ac0  = 0
+    dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
+    mult      $ac1, $0, $0  // ac1  = 0
+    dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
+    sra       s4, s5, 16    // tmp4 = t11
+    addiu     a1, a1, 16
+    addiu     s8, s8, -1
+    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
+    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
+    addu      s2, s5, s4    // tmp2 = t10 + t11
+    subu      s3, s5, s4    // tmp3 = t10 - t11
+    sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
+    sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
+    sh        s2, -16(a1)
+    sh        s3, -8(a1)
+    sh        s0, -12(a1)
+    bgtz      s8, 1b
+     sh       s1, -4(a1)
+    li        t0, 2260
+    li        t1, 11363
+    li        t2, 9633
+    li        t3, 6436
+    li        t4, 6437
+    li        t5, 2261
+    li        t6, 11362
+    li        t7, 2259
+    li        t8, 4433
+    li        t9, 10703
+    li        a1, 10704
+    li        s8, 8
+
+2:
+    lh        a2, 0(a0)     // 0
+    lh        a3, 16(a0)    // 8
+    lh        v0, 32(a0)    // 16
+    lh        v1, 48(a0)    // 24
+    lh        s4, 64(a0)    // 32
+    lh        s5, 80(a0)    // 40
+    lh        s6, 96(a0)    // 48
+    lh        s7, 112(a0)   // 56
+    addu      s2, v0, s5    // tmp2 = 16 + 40
+    subu      s5, v0, s5    // tmp5 = 16 - 40
+    addu      s3, v1, s4    // tmp3 = 24 + 32
+    subu      s4, v1, s4    // tmp4 = 24 - 32
+    addu      s0, a2, s7    // tmp0 =  0 + 56
+    subu      s7, a2, s7    // tmp7 =  0 - 56
+    addu      s1, a3, s6    // tmp1 =  8 + 48
+    subu      s6, a3, s6    // tmp6 =  8 - 48
+    addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
+    subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
+    addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
+    subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
+    mult      s7, t1        // ac0  = tmp7 * c1
+    madd      s4, t0        // ac0 += tmp4 * c0
+    madd      s5, t4        // ac0 += tmp5 * c4
+    madd      s6, t2        // ac0 += tmp6 * c2
+    mult      $ac1, s7, t2  // ac1  = tmp7 * c2
+    msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
+    msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
+    msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
+    mult      $ac2, s7, t4  // ac2  = tmp7 * c4
+    madd      $ac2, s4, t2  // ac2 += tmp4 * c2
+    madd      $ac2, s5, t5  // ac2 += tmp5 * c5
+    msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
+    mult      $ac3, s7, t0  // ac3  = tmp7 * c0
+    msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
+    madd      $ac3, s5, t2  // ac3 += tmp5 * c2
+    msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
+    extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
+    extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
+    extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
+    extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
+    addiu     s8, s8, -1
+    addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
+    subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
+    sh        s0, 16(a0)
+    sh        s1, 48(a0)
+    sh        s2, 80(a0)
+    sh        s3, 112(a0)
+    mult      v0, t8        // ac0  = tmp12 * c8
+    madd      v1, t9        // ac0 += tmp13 * c9
+    mult      $ac1, v1, t8  // ac1  = tmp13 * c8
+    msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
+    addiu     a0, a0, 2
+    extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
+    extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
+    shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
+    shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
+    sh        s4, -2(a0)
+    sh        s5, 62(a0)
+    sh        s6, 30(a0)
+    bgtz      s8, 2b
+     sh       s7, 94(a0)
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    jr       ra
+     nop
+
+END(jsimd_fdct_islow_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
+/*
+ * a0     - data
+ */
+    .set at
+    SAVE_REGS_ON_STACK 8, s0, s1
+    li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
+    li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
+    li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
+    li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
+
+    move         v0, a0
+    addiu        v1, v0, 128     // end address
+
+0:
+    lw           t0, 0(v0)       // tmp0 = 1|0
+    lw           t1, 4(v0)       // tmp1 = 3|2
+    lw           t2, 8(v0)       // tmp2 = 5|4
+    lw           t3, 12(v0)      // tmp3 = 7|6
+    packrl.ph    t1, t1, t1      // tmp1 = 2|3
+    packrl.ph    t3, t3, t3      // tmp3 = 6|7
+    subq.ph      t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
+    subq.ph      t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
+    addq.ph      t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
+    addq.ph      t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
+    addq.ph      t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
+    subq.ph      t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
+    sra          t4, t8, 16      // tmp4 = t11
+    mult         $0, $0          // ac0  = 0
+    dpa.w.ph     $ac0, t9, s1
+    mult         $ac1, $0, $0    // ac1  = 0
+    dpa.w.ph     $ac1, t7, a3    // ac1 += t4*98 + t5*98
+    dpsx.w.ph    $ac1, t5, a3    // ac1 += t6*98 + t7*98
+    mult         $ac2, $0, $0    // ac2  = 0
+    dpa.w.ph     $ac2, t7, a2    // ac2 += t4*139 + t5*139
+    mult         $ac3, $0, $0    // ac3  = 0
+    dpa.w.ph     $ac3, t5, a1    // ac3 += t6*334 + t7*334
+    precrq.ph.w  t0, t5, t7      // t0 = t5|t6
+    addq.ph      t2, t8, t4      // tmp2 = t10 + t11
+    subq.ph      t3, t8, t4      // tmp3 = t10 - t11
+    extr.w       t4, $ac0, 8
+    mult         $0, $0          // ac0  = 0
+    dpa.w.ph     $ac0, t0, s1    // ac0 += t5*181 + t6*181
+    extr.w       t0, $ac1, 8     // t0 = z5
+    extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
+    extr.w       t7, $ac3, 8     // t2 = MULTIPLY(tmp12, 334)
+    extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
+    add          t6, t1, t0      // t6 = z2
+    add          t7, t7, t0      // t7 = z4
+    subq.ph      t0, t5, t8      // t0 = z13 = tmp7 - z3
+    addq.ph      t8, t5, t8      // t9 = z11 = tmp7 + z3
+    addq.ph      t1, t0, t6      // t1 = z13 + z2
+    subq.ph      t6, t0, t6      // t6 = z13 - z2
+    addq.ph      t0, t8, t7      // t0 = z11 + z4
+    subq.ph      t7, t8, t7      // t7 = z11 - z4
+    addq.ph      t5, t4, t9
+    subq.ph      t4, t9, t4
+    sh           t2, 0(v0)
+    sh           t5, 4(v0)
+    sh           t3, 8(v0)
+    sh           t4, 12(v0)
+    sh           t1, 10(v0)
+    sh           t6, 6(v0)
+    sh           t0, 2(v0)
+    sh           t7, 14(v0)
+    addiu        v0, 16
+    bne          v1, v0, 0b
+     nop
+    move         v0, a0
+    addiu        v1, v0, 16
+
+1:
+    lh           t0, 0(v0)       // 0
+    lh           t1, 16(v0)      // 8
+    lh           t2, 32(v0)      // 16
+    lh           t3, 48(v0)      // 24
+    lh           t4, 64(v0)      // 32
+    lh           t5, 80(v0)      // 40
+    lh           t6, 96(v0)      // 48
+    lh           t7, 112(v0)     // 56
+    add          t8, t0, t7      // t8 = tmp0
+    sub          t7, t0, t7      // t7 = tmp7
+    add          t0, t1, t6      // t0 = tmp1
+    sub          t1, t1, t6      // t1 = tmp6
+    add          t6, t2, t5      // t6 = tmp2
+    sub          t5, t2, t5      // t5 = tmp5
+    add          t2, t3, t4      // t2 = tmp3
+    sub          t3, t3, t4      // t3 = tmp4
+    add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
+    sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
+    sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
+    ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
+    add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
+    mult         $0, $0          // ac0  = 0
+    dpa.w.ph     $ac0, t8, s1    // ac0 += t12*181 + t13*181
+    add          s0, t4, t2      // t8 = tmp10+tmp11
+    sub          t4, t4, t2      // t4 = tmp10-tmp11
+    sh           s0, 0(v0)
+    sh           t4, 64(v0)
+    extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
+    addq.ph      t4, t8, t2      // t9 = tmp13 + z1
+    subq.ph      t8, t8, t2      // t2 = tmp13 - z1
+    sh           t4, 32(v0)
+    sh           t8, 96(v0)
+    add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
+    add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
+    add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
+    andi         t4, a1, 0xffff
+    mul          s0, t1, t4
+    sra          s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
+    ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
+    mult         $0, $0          // ac0  = 0
+    mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
+    extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
+    add          t2, t7, t8      // t2 = tmp7 + z5
+    sub          t7, t7, t8      // t7 = tmp7 - z5
+    andi         t4, a2, 0xffff
+    mul          t8, t3, t4
+    sra          t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
+    andi         t4, s1, 0xffff
+    mul          t6, t0, t4
+    sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
+    add          t0, t6, t8      // t0 = z3 + z2
+    sub          t1, t6, t8      // t1 = z3 - z2
+    add          t3, t6, s0      // t3 = z3 + z4
+    sub          t4, t6, s0      // t4 = z3 - z4
+    sub          t5, t2, t1      // t5 = dataptr[5]
+    sub          t6, t7, t0      // t6 = dataptr[3]
+    add          t3, t2, t3      // t3 = dataptr[1]
+    add          t4, t7, t4      // t4 = dataptr[7]
+    sh           t5, 80(v0)
+    sh           t6, 48(v0)
+    sh           t3, 16(v0)
+    sh           t4, 112(v0)
+    addiu        v0, 2
+    bne          v0, v1, 1b
+     nop
+
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j            ra
+     nop
+END(jsimd_fdct_ifast_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
+/*
+ * a0     - coef_block
+ * a1     - divisors
+ * a2     - workspace
+ */
+
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+    addiu   v0, a2, 124  // v0 = workspace_end
+    lh      t0, 0(a2)
+    lh      t1, 0(a1)
+    lh      t2, 128(a1)
+    sra     t3, t0, 15
+    sll     t3, t3, 1
+    addiu   t3, t3, 1
+    mul     t0, t0, t3
+    lh      t4, 384(a1)
+    lh      t5, 130(a1)
+    lh      t6, 2(a2)
+    lh      t7, 2(a1)
+    lh      t8, 386(a1)
+
+1:
+    andi    t1, 0xffff
+    add     t9, t0, t2
+    andi    t9, 0xffff
+    mul     v1, t9, t1
+    sra     s0, t6, 15
+    sll     s0, s0, 1
+    addiu   s0, s0, 1
+    addiu   t9, t4, 16
+    srav    v1, v1, t9
+    mul     v1, v1, t3
+    mul     t6, t6, s0
+    andi    t7, 0xffff
+    addiu   a2, a2, 4
+    addiu   a1, a1, 4
+    add     s1, t6, t5
+    andi    s1, 0xffff
+    sh      v1, 0(a0)
+
+    mul     s2, s1, t7
+    addiu   s1, t8, 16
+    srav    s2, s2, s1
+    mul     s2,s2, s0
+    lh      t0, 0(a2)
+    lh      t1, 0(a1)
+    sra     t3, t0, 15
+    sll     t3, t3, 1
+    addiu   t3, t3, 1
+    mul     t0, t0, t3
+    lh      t2, 128(a1)
+    lh      t4, 384(a1)
+    lh      t5, 130(a1)
+    lh      t8, 386(a1)
+    lh      t6, 2(a2)
+    lh      t7, 2(a1)
+    sh      s2, 2(a0)
+    lh      t0, 0(a2)
+    sra     t3, t0, 15
+    sll     t3, t3, 1
+    addiu   t3, t3, 1
+    mul     t0, t0,t3
+    bne     a2, v0, 1b
+     addiu  a0, a0, 4
+
+    andi    t1, 0xffff
+    add     t9, t0, t2
+    andi    t9, 0xffff
+    mul     v1, t9, t1
+    sra     s0, t6, 15
+    sll     s0, s0, 1
+    addiu   s0, s0, 1
+    addiu   t9, t4, 16
+    srav    v1, v1, t9
+    mul     v1, v1, t3
+    mul     t6, t6, s0
+    andi    t7, 0xffff
+    sh      v1, 0(a0)
+    add     s1, t6, t5
+    andi    s1, 0xffff
+    mul     s2, s1, t7
+    addiu   s1, t8, 16
+    addiu   a2, a2, 4
+    addiu   a1, a1, 4
+    srav    s2, s2, s1
+    mul     s2, s2, s0
+    sh      s2, 2(a0)
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+    j       ra
+     nop
+
+END(jsimd_quantize_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
+/*
+ * a0     - coef_block
+ * a1     - divisors
+ * a2     - workspace
+ */
+
+    .set at
+
+    li         t1, 0x46800100     //integer representation 16384.5
+    mtc1       t1, f0
+    li         t0, 63
+0:
+    lwc1       f1, 0(a2)
+    lwc1       f5, 0(a1)
+    lwc1       f2, 4(a2)
+    lwc1       f6, 4(a1)
+    lwc1       f3, 8(a2)
+    lwc1       f7, 8(a1)
+    lwc1       f4, 12(a2)
+    lwc1       f8, 12(a1)
+    madd.s     f1, f0, f1, f5
+    madd.s     f2, f0, f2, f6
+    madd.s     f3, f0, f3, f7
+    madd.s     f4, f0, f4, f8
+    lwc1       f5, 16(a1)
+    lwc1       f6, 20(a1)
+    trunc.w.s  f1, f1
+    trunc.w.s  f2, f2
+    trunc.w.s  f3, f3
+    trunc.w.s  f4, f4
+    lwc1       f7, 24(a1)
+    lwc1       f8, 28(a1)
+    mfc1       t1, f1
+    mfc1       t2, f2
+    mfc1       t3, f3
+    mfc1       t4, f4
+    lwc1       f1, 16(a2)
+    lwc1       f2, 20(a2)
+    lwc1       f3, 24(a2)
+    lwc1       f4, 28(a2)
+    madd.s     f1, f0, f1, f5
+    madd.s     f2, f0, f2, f6
+    madd.s     f3, f0, f3, f7
+    madd.s     f4, f0, f4, f8
+    addiu      t1, t1, -16384
+    addiu      t2, t2, -16384
+    addiu      t3, t3, -16384
+    addiu      t4, t4, -16384
+    trunc.w.s  f1, f1
+    trunc.w.s  f2, f2
+    trunc.w.s  f3, f3
+    trunc.w.s  f4, f4
+    sh         t1, 0(a0)
+    sh         t2, 2(a0)
+    sh         t3, 4(a0)
+    sh         t4, 6(a0)
+    mfc1       t1, f1
+    mfc1       t2, f2
+    mfc1       t3, f3
+    mfc1       t4, f4
+    addiu      t0, t0, -8
+    addiu      a2, a2, 32
+    addiu      a1, a1, 32
+    addiu      t1, t1, -16384
+    addiu      t2, t2, -16384
+    addiu      t3, t3, -16384
+    addiu      t4, t4, -16384
+    sh         t1, 8(a0)
+    sh         t2, 10(a0)
+    sh         t3, 12(a0)
+    sh         t4, 14(a0)
+    bgez       t0, 0b
+     addiu     a0, a0, 16
+
+    j          ra
+     nop
+
+END(jsimd_quantize_float_mips_dspr2)
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
+/*
+ * a0     - compptr->dct_table
+ * a1     - coef_block
+ * a2     - output_buf
+ * a3     - output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    addiu     sp, sp, -40
+    move      v0, sp
+    addiu     s2, zero, 29692
+    addiu     s3, zero, -10426
+    addiu     s4, zero, 6967
+    addiu     s5, zero, -5906
+    lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
+    lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
+    lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
+    lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
+    mul       t4, t5, t0
+    lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
+    lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
+    mul       t6, t6, t1
+    mul       t5, t5, t0
+    lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
+    lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
+    lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
+    lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
+    mul       t7, t7, t2
+    mult      zero, zero
+    mul       t8, t8, t3
+    li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
+    li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
+    ins       t6, t5, 16, 16    // t6 = t5|t6
+    sll       t4, t4, 15
+    dpa.w.ph  $ac0, t6, s0
+    lh        t1, 2(a1)
+    lh        t6, 2(a0)
+    ins       t8, t7, 16, 16    // t8 = t7|t8
+    dpa.w.ph  $ac0, t8, s1
+    mflo      t0, $ac0
+    mul       t5, t6, t1
+    lh        t1, 18(a1)
+    lh        t6, 18(a0)
+    lh        t2, 50(a1)
+    lh        t7, 50(a0)
+    mul       t6, t6, t1
+    subu      t8, t4, t0
+    mul       t7, t7, t2
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13
+    lh        t1, 82(a1)
+    lh        t2, 82(a0)
+    lh        t3, 114(a1)
+    lh        t4, 114(a0)
+    shra_r.w  t8, t8, 13
+    mul       t1, t1, t2
+    mul       t3, t3, t4
+    sw        t0, 0(v0)
+    sw        t8, 20(v0)
+    sll       t4, t5, 15
+    ins       t7, t6, 16, 16
+    mult      zero, zero
+    dpa.w.ph  $ac0, t7, s0
+    ins       t3, t1, 16, 16
+    lh        t1, 6(a1)
+    lh        t6, 6(a0)
+    dpa.w.ph  $ac0, t3, s1
+    mflo      t0, $ac0
+    mul       t5, t6, t1
+    lh        t1, 22(a1)
+    lh        t6, 22(a0)
+    lh        t2, 54(a1)
+    lh        t7, 54(a0)
+    mul       t6, t6, t1
+    subu      t8, t4, t0
+    mul       t7, t7, t2
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13
+    lh        t1, 86(a1)
+    lh        t2, 86(a0)
+    lh        t3, 118(a1)
+    lh        t4, 118(a0)
+    shra_r.w  t8, t8, 13
+    mul       t1, t1, t2
+    mul       t3, t3, t4
+    sw        t0, 4(v0)
+    sw        t8, 24(v0)
+    sll       t4, t5, 15
+    ins       t7, t6, 16, 16
+    mult      zero, zero
+    dpa.w.ph  $ac0, t7, s0
+    ins       t3, t1, 16, 16
+    lh        t1, 10(a1)
+    lh        t6, 10(a0)
+    dpa.w.ph  $ac0, t3, s1
+    mflo      t0, $ac0
+    mul       t5, t6, t1
+    lh        t1, 26(a1)
+    lh        t6, 26(a0)
+    lh        t2, 58(a1)
+    lh        t7, 58(a0)
+    mul       t6, t6, t1
+    subu      t8, t4, t0
+    mul       t7, t7, t2
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13
+    lh        t1, 90(a1)
+    lh        t2, 90(a0)
+    lh        t3, 122(a1)
+    lh        t4, 122(a0)
+    shra_r.w  t8, t8, 13
+    mul       t1, t1, t2
+    mul       t3, t3, t4
+    sw        t0, 8(v0)
+    sw        t8, 28(v0)
+    sll       t4, t5, 15
+    ins       t7, t6, 16, 16
+    mult      zero, zero
+    dpa.w.ph  $ac0, t7, s0
+    ins       t3, t1, 16, 16
+    lh        t1, 14(a1)
+    lh        t6, 14(a0)
+    dpa.w.ph  $ac0, t3, s1
+    mflo      t0, $ac0
+    mul       t5, t6, t1
+    lh        t1, 30(a1)
+    lh        t6, 30(a0)
+    lh        t2, 62(a1)
+    lh        t7, 62(a0)
+    mul       t6, t6, t1
+    subu      t8, t4, t0
+    mul       t7, t7, t2
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13
+    lh        t1, 94(a1)
+    lh        t2, 94(a0)
+    lh        t3, 126(a1)
+    lh        t4, 126(a0)
+    shra_r.w  t8, t8, 13
+    mul       t1, t1, t2
+    mul       t3, t3, t4
+    sw        t0, 12(v0)
+    sw        t8, 32(v0)
+    sll       t4, t5, 15
+    ins       t7, t6, 16, 16
+    mult      zero, zero
+    dpa.w.ph  $ac0, t7, s0
+    ins       t3, t1, 16, 16
+    dpa.w.ph  $ac0, t3, s1
+    mflo      t0, $ac0
+    lw        t9, 0(a2)
+    lw        t3, 0(v0)
+    lw        t7, 4(v0)
+    lw        t1, 8(v0)
+    addu      t9, t9, a3
+    sll       t3, t3, 15
+    subu      t8, t4, t0
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13
+    shra_r.w  t8, t8, 13
+    sw        t0, 16(v0)
+    sw        t8, 36(v0)
+    lw        t5, 12(v0)
+    lw        t6, 16(v0)
+    mult      t7, s2
+    madd      t1, s3
+    madd      t5, s4
+    madd      t6, s5
+    lw        t5, 24(v0)
+    lw        t7, 28(v0)
+    mflo      t0, $ac0
+    lw        t8, 32(v0)
+    lw        t2, 36(v0)
+    mult      $ac1, t5, s2
+    madd      $ac1, t7, s3
+    madd      $ac1, t8, s4
+    madd      $ac1, t2, s5
+    addu      t1, t3, t0
+    subu      t6, t3, t0
+    shra_r.w  t1, t1, 20
+    shra_r.w  t6, t6, 20
+    mflo      t4, $ac1
+    shll_s.w  t1, t1, 24
+    shll_s.w  t6, t6, 24
+    sra       t1, t1, 24
+    sra       t6, t6, 24
+    addiu     t1, t1, 128
+    addiu     t6, t6, 128
+    lw        t0, 20(v0)
+    sb        t1, 0(t9)
+    sb        t6, 1(t9)
+    sll       t0, t0, 15
+    lw        t9, 4(a2)
+    addu      t1, t0, t4
+    subu      t6, t0, t4
+    addu      t9, t9, a3
+    shra_r.w  t1, t1, 20
+    shra_r.w  t6, t6, 20
+    shll_s.w  t1, t1, 24
+    shll_s.w  t6, t6, 24
+    sra       t1, t1, 24
+    sra       t6, t6, 24
+    addiu     t1, t1, 128
+    addiu     t6, t6, 128
+    sb        t1, 0(t9)
+    sb        t6, 1(t9)
+    addiu     sp, sp, 40
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j         ra
+     nop
+
+END(jsimd_idct_2x2_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
+/*
+ * a0     - compptr->dct_table
+ * a1     - coef_block
+ * a2     - output_buf
+ * a3     - output_col
+ * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
+ */
+
+    .set at
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw        v1, 48(sp)
+    move      t0, a1
+    move      t1, v1
+    li        t9, 4
+    li        s0, 0x2e75f93e
+    li        s1, 0x21f9ba79
+    li        s2, 0xecc2efb0
+    li        s3, 0x52031ccd
+
+0:
+    lh        s6, 32(t0)        // inptr[DCTSIZE*2]
+    lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
+    lh        s7, 96(t0)        // inptr[DCTSIZE*6]
+    lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
+    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        s4, 0(t0)         // inptr[DCTSIZE*0]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s5, 0(a0)         // quantptr[0]
+    li        s6, 15137
+    li        s7, 6270
+    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
+    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        t5, 112(t0)       // inptr[DCTSIZE*7]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
+    lh        v0, 80(t0)        // inptr[DCTSIZE*5]
+    lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
+    lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
+    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
+    lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
+    lh        t8, 16(t0)        // inptr[DCTSIZE*1]
+    subu      t6, t6, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
+    lh        t7, 48(t0)        // inptr[DCTSIZE*3]
+    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+    mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+    addu      t3, t2, t6        // tmp10 = tmp0 + z2
+    subu      t4, t2, t6        // tmp10 = tmp0 - z2
+    mult      $ac0, zero, zero
+    mult      $ac1, zero, zero
+    ins       t5, v0, 16, 16
+    ins       t7, t8, 16, 16
+    addiu     t9, t9, -1
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    mflo      s4, $ac0
+    mflo      s5, $ac1
+    addiu     a0, a0, 2
+    addiu     t1, t1, 4
+    addiu     t0, t0, 2
+    addu      t6, t4, s4
+    subu      t5, t4, s4
+    addu      s6, t3, s5
+    subu      s7, t3, s5
+    shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
+    shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
+    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
+    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
+    sw        t6, 28(t1)
+    sw        t5, 60(t1)
+    sw        s6, -4(t1)
+    bgtz      t9, 0b
+     sw       s7, 92(t1)
+    // second loop three pass
+    li        t9, 3
+1:
+    lh        s6, 34(t0)        // inptr[DCTSIZE*2]
+    lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
+    lh        s7, 98(t0)        // inptr[DCTSIZE*6]
+    lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
+    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        s4, 2(t0)         // inptr[DCTSIZE*0]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
+    li        s6, 15137
+    li        s7, 6270
+    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
+    mul       v0, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        t5, 114(t0)       // inptr[DCTSIZE*7]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
+    lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
+    lh        t6, 82(t0)        // inptr[DCTSIZE*5]
+    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
+    lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
+    lh        t8, 18(t0)        // inptr[DCTSIZE*1]
+    subu      v0, v0, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
+    lh        t7, 50(t0)        // inptr[DCTSIZE*3]
+    lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
+    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+    mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+    addu      t3, t2, v0        // tmp10 = tmp0 + z2
+    subu      t4, t2, v0        // tmp10 = tmp0 - z2
+    mult      $ac0, zero, zero
+    mult      $ac1, zero, zero
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    mflo      t5, $ac0
+    mflo      t6, $ac1
+    addiu     t9, t9, -1
+    addiu     t0, t0, 2
+    addiu     a0, a0, 2
+    addiu     t1, t1, 4
+    addu      s5, t4, t5
+    subu      s4, t4, t5
+    addu      s6, t3, t6
+    subu      s7, t3, t6
+    shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
+    shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
+    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
+    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
+    sw        s5, 32(t1)
+    sw        s4, 64(t1)
+    sw        s6, 0(t1)
+    bgtz      t9, 1b
+     sw       s7, 96(t1)
+    move      t1, v1
+    li        s4, 15137
+    lw        s6, 8(t1)         // wsptr[2]
+    li        s5, 6270
+    lw        s7, 24(t1)        // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    lw        t2, 0(t1)         // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
+    lh        t5, 28(t1)        // wsptr[7]
+    lh        t6, 20(t1)        // wsptr[5]
+    lh        t7, 12(t1)        // wsptr[3]
+    lh        t8, 4(t1)         // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
+    sll       s4, t9, 2
+    lw        v0, 0(a2)         // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+    // 2
+    li        s4, 15137
+    lw        s6, 40(t1)        // wsptr[2]
+    li        s5, 6270
+    lw        s7, 56(t1)        // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    lw        t2, 32(t1)        // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
+    lh        t5, 60(t1)        // wsptr[7]
+    lh        t6, 52(t1)        // wsptr[5]
+    lh        t7, 44(t1)        // wsptr[3]
+    lh        t8, 36(t1)        // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
+    sll       s4, t9, 2
+    lw        v0, 4(a2)         // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+    // 3
+    li        s4, 15137
+    lw        s6, 72(t1)        // wsptr[2]
+    li        s5, 6270
+    lw        s7, 88(t1)        // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    lw        t2, 64(t1)        // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
+    lh        t5, 92(t1)        // wsptr[7]
+    lh        t6, 84(t1)        // wsptr[5]
+    lh        t7, 76(t1)        // wsptr[3]
+    lh        t8, 68(t1)        // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
+    sll       s4, t9, 2
+    lw        v0, 8(a2)         // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+    li        s4, 15137
+    lw        s6, 104(t1)       // wsptr[2]
+    li        s5, 6270
+    lw        s7, 120(t1)       // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
+    lw        t2, 96(t1)        // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], -FIX_0_765366865)
+    lh        t5, 124(t1)       // wsptr[7]
+    lh        t6, 116(t1)       // wsptr[5]
+    lh        t7, 108(t1)       // wsptr[3]
+    lh        t8, 100(t1)       // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2;
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2;
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
+    sll       s4, t9, 2
+    lw        v0, 12(a2)        // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j         ra
+     nop
+END(jsimd_idct_4x4_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
+/*
+ * a0     - compptr->dct_table
+ * a1     - coef_block
+ * a2     - output_buf
+ * a3     - output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu     sp, sp, -144
+    move      v0, sp
+    addiu     v1, v0, 24
+    addiu     t9, zero, 5793
+    addiu     s0, zero, 10033
+    addiu     s1, zero, 2998
+
+1:
+    lh        s2, 0(a0)   // q0 = quantptr[ 0]
+    lh        s3, 32(a0)  // q1 = quantptr[16]
+    lh        s4, 64(a0)  // q2 = quantptr[32]
+    lh        t2, 64(a1)  // tmp2 = inptr[32]
+    lh        t1, 32(a1)  // tmp1 = inptr[16]
+    lh        t0, 0(a1)   // tmp0 = inptr[ 0]
+    mul       t2, t2, s4  // tmp2 = tmp2 * q2
+    mul       t1, t1, s3  // tmp1 = tmp1 * q1
+    mul       t0, t0, s2  // tmp0 = tmp0 * q0
+    lh        t6, 16(a1)  // z1 = inptr[ 8]
+    lh        t8, 80(a1)  // z3 = inptr[40]
+    lh        t7, 48(a1)  // z2 = inptr[24]
+    lh        s2, 16(a0)  // q0 = quantptr[ 8]
+    lh        s4, 80(a0)  // q2 = quantptr[40]
+    lh        s3, 48(a0)  // q1 = quantptr[24]
+    mul       t2, t2, t9  // tmp2 = tmp2 * 5793
+    mul       t1, t1, s0  // tmp1 = tmp1 * 10033
+    sll       t0, t0, 13  // tmp0 = tmp0 << 13
+    mul       t6, t6, s2  // z1 = z1 * q0
+    mul       t8, t8, s4  // z3 = z3 * q2
+    mul       t7, t7, s3  // z2 = z2 * q1
+    addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
+    sll       t2, t2, 1   // tmp2 = tmp2 << 2
+    subu      t4, t0, t2  // tmp11 = tmp0 - tmp2;
+    subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
+    addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
+    addu      t1, t6, t8  // tmp1 = z1 + z3
+    mul       t1, t1, s1  // tmp1 = tmp1 * 2998
+    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
+    subu      t2, t6, t8  // tmp2 = z1 - z3
+    subu      t2, t2, t7  // tmp2 = tmp2 - z2
+    sll       t2, t2, 2   // tmp2 = tmp2 << 2
+    addu      t0, t6, t7  // tmp0 = z1 + z2
+    sll       t0, t0, 13  // tmp0 = tmp0 << 13
+    subu      s2, t8, t7  // q0 = z3 - z2
+    sll       s2, s2, 13  // q0 = q0 << 13
+    addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
+    addu      t1, s2, t1  // tmp1 = q0 + tmp1
+    addu      s2, t4, t2  // q0 = tmp11 + tmp2
+    subu      s3, t4, t2  // q1 = tmp11 - tmp2
+    addu      t6, t3, t0  // z1 = tmp10 + tmp0
+    subu      t7, t3, t0  // z2 = tmp10 - tmp0
+    addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
+    subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
+    shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
+    shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
+    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
+    shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
+    sw        s2, 24(v0)
+    sw        s3, 96(v0)
+    sw        t6, 0(v0)
+    sw        t7, 120(v0)
+    sw        t4, 48(v0)
+    sw        t5, 72(v0)
+    addiu     v0, v0, 4
+    addiu     a1, a1, 2
+    bne       v0, v1, 1b
+     addiu    a0, a0, 2
+
+    /* Pass 2: process 6 rows from work array, store into output array. */
+    move      v0, sp
+    addiu     v1, v0, 144
+
+2:
+    lw        t0, 0(v0)
+    lw        t2, 16(v0)
+    lw        s5, 0(a2)
+    addiu     t0, t0, 16
+    sll       t0, t0, 13
+    mul       t3, t2, t9
+    lw        t6, 4(v0)
+    lw        t8, 20(v0)
+    lw        t7, 12(v0)
+    addu      s5, s5, a3
+    addu      s6, t6, t8
+    mul       s6, s6, s1
+    addu      t1, t0, t3
+    subu      t4, t0, t3
+    subu      t4, t4, t3
+    lw        t3, 8(v0)
+    mul       t0, t3, s0
+    addu      s7, t6, t7
+    sll       s7, s7, 13
+    addu      s7, s6, s7
+    subu      t2, t8, t7
+    sll       t2, t2, 13
+    addu      t2, s6, t2
+    subu      s6, t6, t7
+    subu      s6, s6, t8
+    sll       s6, s6, 13
+    addu      t3, t1, t0
+    subu      t5, t1, t0
+    addu      t6, t3, s7
+    subu      t3, t3, s7
+    addu      t7, t4, s6
+    subu      t4, t4, s6
+    addu      t8, t5, t2
+    subu      t5, t5, t2
+    shll_s.w  t6, t6, 6
+    shll_s.w  t3, t3, 6
+    shll_s.w  t7, t7, 6
+    shll_s.w  t4, t4, 6
+    shll_s.w  t8, t8, 6
+    shll_s.w  t5, t5, 6
+    sra       t6, t6, 24
+    addiu     t6, t6, 128
+    sra       t3, t3, 24
+    addiu     t3, t3, 128
+    sb        t6, 0(s5)
+    sra       t7, t7, 24
+    addiu     t7, t7, 128
+    sb        t3, 5(s5)
+    sra       t4, t4, 24
+    addiu     t4, t4, 128
+    sb        t7, 1(s5)
+    sra       t8, t8, 24
+    addiu     t8, t8, 128
+    sb        t4, 4(s5)
+    addiu     v0, v0, 24
+    sra       t5, t5, 24
+    addiu     t5, t5, 128
+    sb        t8, 2(s5)
+    addiu     a2, a2,  4
+    bne       v0, v1, 2b
+     sb       t5, 3(s5)
+
+    addiu     sp, sp, 144
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j         ra
+     nop
+
+END(jsimd_idct_6x6_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
+/*
+ * a0     - compptr->dct_table
+ * a1     - coef_block
+ * a2     - workspace
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li         a3, 8
+
+1:
+    // odd part
+    lh         t0, 48(a1)
+    lh         t1, 48(a0)
+    lh         t2, 16(a1)
+    lh         t3, 16(a0)
+    lh         t4, 80(a1)
+    lh         t5, 80(a0)
+    lh         t6, 112(a1)
+    lh         t7, 112(a0)
+    mul        t0, t0, t1    // z2
+    mul        t1, t2, t3    // z1
+    mul        t2, t4, t5    // z3
+    mul        t3, t6, t7    // z4
+    li         t4, 10703     // FIX(1.306562965)
+    li         t5, 4433      // FIX_0_541196100
+    li         t6, 7053      // FIX(0.860918669)
+    mul        t4, t0,t4     // tmp11
+    mul        t5, t0,t5     // -tmp14
+    addu       t7, t1,t2     // tmp10
+    addu       t8, t7,t3     // tmp10 + z4
+    mul        t6, t6, t8    // tmp15
+    li         t8, 2139      // FIX(0.261052384)
+    mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
+    li         t7, 2295      // FIX(0.280143716)
+    mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
+    addu       t9, t2, t3    // z3 + z4
+    li         s0, 8565      // FIX(1.045510580)
+    mul        t9, t9, s0    // -tmp13
+    li         s0, 12112     // FIX(1.478575242)
+    mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242)
+    li         s1, 12998     // FIX(1.586706681)
+    mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
+    li         s2, 5540      // FIX(0.676326758)
+    mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
+    li         s3, 16244     // FIX(1.982889723)
+    mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
+    subu       t1, t1, t3    // z1-=z4
+    subu       t0, t0, t2    // z2-=z3
+    addu       t2, t0, t1    // z1+z2
+    li         t3, 4433      // FIX_0_541196100
+    mul        t2, t2, t3    // z3
+    li         t3, 6270      // FIX_0_765366865
+    mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
+    li         t3, 15137     // FIX_0_765366865
+    mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
+    addu       t8, t6, t8    // tmp12
+    addu       t3, t8, t4    // tmp12 + tmp11
+    addu       t3, t3, t7    // tmp10
+    subu       t8, t8, t9    // tmp12 + tmp13
+    addu       s0, t5, s0
+    subu       t8, t8, s0    // tmp12
+    subu       t9, t6, t9
+    subu       s1, s1, t4
+    addu       t9, t9, s1    // tmp13
+    subu       t6, t6, t5
+    subu       t6, t6, s2
+    subu       t6, t6, s3    // tmp15
+    // even part start
+    lh         t4, 64(a1)
+    lh         t5, 64(a0)
+    lh         t7, 32(a1)
+    lh         s0, 32(a0)
+    lh         s1, 0(a1)
+    lh         s2, 0(a0)
+    lh         s3, 96(a1)
+    lh         v0, 96(a0)
+    mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
+    mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
+    mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
+    mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
+    // odd part end
+    addu       t1, t2, t1    // tmp11
+    subu       t0, t2, t0    // tmp14
+    // update counter and pointers
+    addiu      a3, a3, -1
+    addiu      a0, a0, 2
+    addiu      a1, a1, 2
+    // even part rest
+    li         s1, 10033
+    li         s2, 11190
+    mul        t4, t4, s1    // z4
+    mul        s1, t5, s2    // z4
+    sll        t5, t5, 13    // z1
+    sll        t7, t7, 13
+    addiu      t7, t7, 1024  // z3
+    sll        s0, s0, 13    // z2
+    addu       s2, t7, t4    // tmp10
+    subu       t4, t7, t4    // tmp11
+    subu       s3, t5, s0    // tmp12
+    addu       t2, t7, s3    // tmp21
+    subu       s3, t7, s3    // tmp24
+    addu       t7, s1, s0    // tmp12
+    addu       v0, s2, t7    // tmp20
+    subu       s2, s2, t7    // tmp25
+    subu       s1, s1, t5    // z4 - z1
+    subu       s1, s1, s0    // tmp12
+    addu       s0, t4, s1    // tmp22
+    subu       t4, t4, s1    // tmp23
+    // final output stage
+    addu       t5, v0, t3
+    subu       v0, v0, t3
+    addu       t3, t2, t1
+    subu       t2, t2, t1
+    addu       t1, s0, t8
+    subu       s0, s0, t8
+    addu       t8, t4, t9
+    subu       t4, t4, t9
+    addu       t9, s3, t0
+    subu       s3, s3, t0
+    addu       t0, s2, t6
+    subu       s2, s2, t6
+    sra        t5, t5, 11
+    sra        t3, t3, 11
+    sra        t1, t1, 11
+    sra        t8, t8, 11
+    sra        t9, t9, 11
+    sra        t0, t0, 11
+    sra        s2, s2, 11
+    sra        s3, s3, 11
+    sra        t4, t4, 11
+    sra        s0, s0, 11
+    sra        t2, t2, 11
+    sra        v0, v0, 11
+    sw         t5, 0(a2)
+    sw         t3, 32(a2)
+    sw         t1, 64(a2)
+    sw         t8, 96(a2)
+    sw         t9, 128(a2)
+    sw         t0, 160(a2)
+    sw         s2, 192(a2)
+    sw         s3, 224(a2)
+    sw         t4, 256(a2)
+    sw         s0, 288(a2)
+    sw         t2, 320(a2)
+    sw         v0, 352(a2)
+    bgtz       a3, 1b
+     addiu     a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j          ra
+     nop
+
+END(jsimd_idct_12x12_pass1_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
+/*
+ * a0     - workspace
+ * a1     - output
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li        a3, 12
+
+1:
+    // Odd part
+    lw        t0, 12(a0)
+    lw        t1, 4(a0)
+    lw        t2, 20(a0)
+    lw        t3, 28(a0)
+    li        t4, 10703     // FIX(1.306562965)
+    li        t5, 4433      // FIX_0_541196100
+    mul       t4, t0, t4    // tmp11
+    mul       t5, t0, t5    // -tmp14
+    addu      t6, t1, t2    // tmp10
+    li        t7, 2139      // FIX(0.261052384)
+    mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
+    addu      t6, t6, t3    // tmp10 + z4
+    li        t8, 7053      // FIX(0.860918669)
+    mul       t6, t6, t8    // tmp15
+    li        t8, 2295      // FIX(0.280143716)
+    mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
+    addu      t9, t2, t3    // z3 + z4
+    li        s0, 8565      // FIX(1.045510580)
+    mul       t9, t9, s0    // -tmp13
+    li        s0, 12112     // FIX(1.478575242)
+    mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
+    li        s1, 12998     // FIX(1.586706681)
+    mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
+    li        s2, 5540      // FIX(0.676326758)
+    mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
+    li        s3, 16244     // FIX(1.982889723)
+    mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
+    subu      t1, t1, t3    // z1 -= z4
+    subu      t0, t0, t2    // z2 -= z3
+    addu      t2, t1, t0    // z1 + z2
+    li        t3, 4433      // FIX_0_541196100
+    mul       t2, t2, t3    // z3
+    li        t3, 6270      // FIX_0_765366865
+    mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
+    li        t3, 15137     // FIX_1_847759065
+    mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
+    addu      t3, t6, t7    // tmp12
+    addu      t7, t3, t4
+    addu      t7, t7, t8    // tmp10
+    subu      t3, t3, t9
+    subu      t3, t3, t5
+    subu      t3, t3, s0    // tmp12
+    subu      t9, t6, t9
+    subu      t9, t9, t4
+    addu      t9, t9, s1    // tmp13
+    subu      t6, t6, t5
+    subu      t6, t6, s2
+    subu      t6, t6, s3    // tmp15
+    addu      t1, t2, t1    // tmp11
+    subu      t0, t2, t0    // tmp14
+    // even part
+    lw        t2, 16(a0)    // z4
+    lw        t4, 8(a0)     // z1
+    lw        t5, 0(a0)     // z3
+    lw        t8, 24(a0)    // z2
+    li        s0, 10033     // FIX(1.224744871)
+    li        s1, 11190     // FIX(1.366025404)
+    mul       t2, t2, s0    // z4
+    mul       s0, t4, s1    // z4
+    addiu     t5, t5, 0x10
+    sll       t5, t5, 13    // z3
+    sll       t4, t4, 13    // z1
+    sll       t8, t8, 13    // z2
+    subu      s1, t4, t8    // tmp12
+    addu      s2, t5, t2    // tmp10
+    subu      t2, t5, t2    // tmp11
+    addu      s3, t5, s1    // tmp21
+    subu      s1, t5, s1    // tmp24
+    addu      t5, s0, t8    // tmp12
+    addu      v0, s2, t5    // tmp20
+    subu      t5, s2, t5    // tmp25
+    subu      t4, s0, t4
+    subu      t4, t4, t8    // tmp12
+    addu      t8, t2, t4    // tmp22
+    subu      t2, t2, t4    // tmp23
+    // increment counter and pointers
+    addiu     a3, a3, -1
+    addiu     a0, a0, 32
+    // Final stage
+    addu      t4, v0, t7
+    subu      v0, v0, t7
+    addu      t7, s3, t1
+    subu      s3, s3, t1
+    addu      t1, t8, t3
+    subu      t8, t8, t3
+    addu      t3, t2, t9
+    subu      t2, t2, t9
+    addu      t9, s1, t0
+    subu      s1, s1, t0
+    addu      t0, t5, t6
+    subu      t5, t5, t6
+    sll       t4, t4, 4
+    sll       t7, t7, 4
+    sll       t1, t1, 4
+    sll       t3, t3, 4
+    sll       t9, t9, 4
+    sll       t0, t0, 4
+    sll       t5, t5, 4
+    sll       s1, s1, 4
+    sll       t2, t2, 4
+    sll       t8, t8, 4
+    sll       s3, s3, 4
+    sll       v0, v0, 4
+    shll_s.w  t4, t4, 2
+    shll_s.w  t7, t7, 2
+    shll_s.w  t1, t1, 2
+    shll_s.w  t3, t3, 2
+    shll_s.w  t9, t9, 2
+    shll_s.w  t0, t0, 2
+    shll_s.w  t5, t5, 2
+    shll_s.w  s1, s1, 2
+    shll_s.w  t2, t2, 2
+    shll_s.w  t8, t8, 2
+    shll_s.w  s3, s3, 2
+    shll_s.w  v0, v0, 2
+    srl       t4, t4, 24
+    srl       t7, t7, 24
+    srl       t1, t1, 24
+    srl       t3, t3, 24
+    srl       t9, t9, 24
+    srl       t0, t0, 24
+    srl       t5, t5, 24
+    srl       s1, s1, 24
+    srl       t2, t2, 24
+    srl       t8, t8, 24
+    srl       s3, s3, 24
+    srl       v0, v0, 24
+    lw        t6, 0(a1)
+    addiu     t4, t4, 0x80
+    addiu     t7, t7, 0x80
+    addiu     t1, t1, 0x80
+    addiu     t3, t3, 0x80
+    addiu     t9, t9, 0x80
+    addiu     t0, t0, 0x80
+    addiu     t5, t5, 0x80
+    addiu     s1, s1, 0x80
+    addiu     t2, t2, 0x80
+    addiu     t8, t8, 0x80
+    addiu     s3, s3, 0x80
+    addiu     v0, v0, 0x80
+    sb        t4, 0(t6)
+    sb        t7, 1(t6)
+    sb        t1, 2(t6)
+    sb        t3, 3(t6)
+    sb        t9, 4(t6)
+    sb        t0, 5(t6)
+    sb        t5, 6(t6)
+    sb        s1, 7(t6)
+    sb        t2, 8(t6)
+    sb        t8, 9(t6)
+    sb        s3, 10(t6)
+    sb        v0, 11(t6)
+    bgtz      a3, 1b
+     addiu    a1, a1, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    jr        ra
+     nop
+
+END(jsimd_idct_12x12_pass2_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
+/*
+ * a0     - sample_data
+ * a1     - start_col
+ * a2     - workspace
+ */
+
+    lw             t0, 0(a0)
+    li             t7, 0xff80ff80
+    addu           t0, t0, a1
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    lw             t0, 4(a0)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 0(a2)
+    usw            t4, 4(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 8(a2)
+    usw            t6, 12(a2)
+
+    lw             t0, 8(a0)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 16(a2)
+    usw            t4, 20(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 24(a2)
+    usw            t6, 28(a2)
+
+    lw             t0, 12(a0)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 32(a2)
+    usw            t4, 36(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 40(a2)
+    usw            t6, 44(a2)
+
+    lw             t0, 16(a0)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 48(a2)
+    usw            t4, 52(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 56(a2)
+    usw            t6, 60(a2)
+
+    lw             t0, 20(a0)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 64(a2)
+    usw            t4, 68(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 72(a2)
+    usw            t6, 76(a2)
+
+    lw             t0, 24(a0)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 80(a2)
+    usw            t4, 84(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 88(a2)
+    usw            t6, 92(a2)
+
+    lw             t0, 28(a0)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 96(a2)
+    usw            t4, 100(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 104(a2)
+    usw            t6, 108(a2)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 112(a2)
+    usw            t4, 116(a2)
+    usw            t5, 120(a2)
+    usw            t6, 124(a2)
+
+    j              ra
+     nop
+
+END(jsimd_convsamp_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
+/*
+ * a0     - sample_data
+ * a1     - start_col
+ * a2     - workspace
+ */
+
+    .set at
+
+    lw       t0, 0(a0)
+    addu     t0, t0, a1
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    lw       t0, 4(a0)
+    swc1     f1, 0(a2)
+    swc1     f2, 4(a2)
+    swc1     f3, 8(a2)
+    addu     t0, t0, a1
+    swc1     f4, 12(a2)
+    swc1     f5, 16(a2)
+    swc1     f6, 20(a2)
+    swc1     f7, 24(a2)
+    swc1     f8, 28(a2)
+    //elemr 1
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    lw       t0, 8(a0)
+    swc1     f1, 32(a2)
+    swc1     f2, 36(a2)
+    swc1     f3, 40(a2)
+    addu     t0, t0, a1
+    swc1     f4, 44(a2)
+    swc1     f5, 48(a2)
+    swc1     f6, 52(a2)
+    swc1     f7, 56(a2)
+    swc1     f8, 60(a2)
+    //elemr 2
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    lw       t0, 12(a0)
+    swc1     f1, 64(a2)
+    swc1     f2, 68(a2)
+    swc1     f3, 72(a2)
+    addu     t0, t0, a1
+    swc1     f4, 76(a2)
+    swc1     f5, 80(a2)
+    swc1     f6, 84(a2)
+    swc1     f7, 88(a2)
+    swc1     f8, 92(a2)
+    //elemr 3
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    lw       t0, 16(a0)
+    swc1     f1, 96(a2)
+    swc1     f2, 100(a2)
+    swc1     f3, 104(a2)
+    addu     t0, t0, a1
+    swc1     f4, 108(a2)
+    swc1     f5, 112(a2)
+    swc1     f6, 116(a2)
+    swc1     f7, 120(a2)
+    swc1     f8, 124(a2)
+    //elemr 4
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    lw       t0, 20(a0)
+    swc1     f1, 128(a2)
+    swc1     f2, 132(a2)
+    swc1     f3, 136(a2)
+    addu     t0, t0, a1
+    swc1     f4, 140(a2)
+    swc1     f5, 144(a2)
+    swc1     f6, 148(a2)
+    swc1     f7, 152(a2)
+    swc1     f8, 156(a2)
+    //elemr 5
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    lw       t0, 24(a0)
+    swc1     f1, 160(a2)
+    swc1     f2, 164(a2)
+    swc1     f3, 168(a2)
+    addu     t0, t0, a1
+    swc1     f4, 172(a2)
+    swc1     f5, 176(a2)
+    swc1     f6, 180(a2)
+    swc1     f7, 184(a2)
+    swc1     f8, 188(a2)
+    //elemr 6
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    lw       t0, 28(a0)
+    swc1     f1, 192(a2)
+    swc1     f2, 196(a2)
+    swc1     f3, 200(a2)
+    addu     t0, t0, a1
+    swc1     f4, 204(a2)
+    swc1     f5, 208(a2)
+    swc1     f6, 212(a2)
+    swc1     f7, 216(a2)
+    swc1     f8, 220(a2)
+    //elemr 7
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f1
+    mtc1     t2, f2
+    mtc1     t3, f3
+    mtc1     t4, f4
+    mtc1     t5, f5
+    mtc1     t6, f6
+    mtc1     t7, f7
+    mtc1     t8, f8
+    cvt.s.w  f1, f1
+    cvt.s.w  f2, f2
+    cvt.s.w  f3, f3
+    cvt.s.w  f4, f4
+    cvt.s.w  f5, f5
+    cvt.s.w  f6, f6
+    cvt.s.w  f7, f7
+    cvt.s.w  f8, f8
+    swc1     f1, 224(a2)
+    swc1     f2, 228(a2)
+    swc1     f3, 232(a2)
+    swc1     f4, 236(a2)
+    swc1     f5, 240(a2)
+    swc1     f6, 244(a2)
+    swc1     f7, 248(a2)
+    swc1     f8, 252(a2)
+
+    j        ra
+     nop
+
+END(jsimd_convsamp_float_mips_dspr2)
+
+/*****************************************************************************/
+
diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h
new file mode 100644
index 0000000..50ec31b
--- /dev/null
+++ b/simd/jsimd_mips_dspr2_asm.h
@@ -0,0 +1,285 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0
+#define AT   $1
+#define v0   $2
+#define v1   $3
+#define a0   $4
+#define a1   $5
+#define a2   $6
+#define a3   $7
+#define t0   $8
+#define t1   $9
+#define t2   $10
+#define t3   $11
+#define t4   $12
+#define t5   $13
+#define t6   $14
+#define t7   $15
+#define s0   $16
+#define s1   $17
+#define s2   $18
+#define s3   $19
+#define s4   $20
+#define s5   $21
+#define s6   $22
+#define s7   $23
+#define t8   $24
+#define t9   $25
+#define k0   $26
+#define k1   $27
+#define gp   $28
+#define sp   $29
+#define fp   $30
+#define s8   $30
+#define ra   $31
+
+#define f0   $f0
+#define f1   $f1
+#define f2   $f2
+#define f3   $f3
+#define f4   $f4
+#define f5   $f5
+#define f6   $f6
+#define f7   $f7
+#define f8   $f8
+#define f9   $f9
+#define f10  $f10
+#define f11  $f11
+#define f12  $f12
+#define f13  $f13
+#define f14  $f14
+#define f15  $f15
+#define f16  $f16
+#define f17  $f17
+#define f18  $f18
+#define f19  $f19
+#define f20  $f20
+#define f21  $f21
+#define f22  $f22
+#define f23  $f23
+#define f24  $f24
+#define f25  $f25
+#define f26  $f26
+#define f27  $f27
+#define f28  $f28
+#define f29  $f29
+#define f30  $f30
+#define f31  $f31
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol)                           \
+                .globl  symbol;                         \
+                .align  2;                              \
+                .type   symbol, @function;              \
+                .ent    symbol, 0;                      \
+symbol:         .frame  sp, 0, ra;                      \
+                .set    push;                           \
+                .set    arch=mips32r2;                  \
+                .set    noreorder;                      \
+                .set    noat;
+
+/*
+ * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_MIPS_DSPR2(symbol)                         \
+LEAF_MIPS32R2(symbol)                                   \
+                .set    dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function)                                   \
+                .set    pop;                            \
+                .end    function;                       \
+                .size   function,.-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+                          r2  = 0, r3  = 0, r4  = 0, \
+                          r5  = 0, r6  = 0, r7  = 0, \
+                          r8  = 0, r9  = 0, r10 = 0, \
+                          r11 = 0, r12 = 0, r13 = 0, \
+                          r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, -\stack_offset
+    .endif
+    sw              \r1, 0(sp)
+    .if \r2 != 0
+    sw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    sw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    sw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw              \r14, 52(sp)
+    .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+                               r2  = 0, r3  = 0, r4  = 0, \
+                               r5  = 0, r6  = 0, r7  = 0, \
+                               r8  = 0, r9  = 0, r10 = 0, \
+                               r11 = 0, r12 = 0, r13 = 0, \
+                               r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    lw              \r1, 0(sp)
+    .if \r2 != 0
+    lw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    lw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    lw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw              \r14, 52(sp)
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, \stack_offset
+    .endif
+.endm
+
+
diff --git a/tjbench.c b/tjbench.c
index 1f88e13..7a7bc92 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -29,6 +29,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <ctype.h>
 #include <math.h>
 #include <errno.h>
 #include <cdjpeg.h>
@@ -44,9 +45,9 @@
 #define _throwtj(m) _throw(m, tjGetErrorStr())
 #define _throwbmp(m) _throw(m, bmpgeterr())
 
-enum {YUVENCODE=1, YUVDECODE};
-int flags=TJFLAG_NOREALLOC, decomponly=0, yuv=0, quiet=0, dotile=0,
-	pf=TJPF_BGR;
+enum {YUVENCODE=1, YUVDECODE, YUVCOMPRESS};
+int flags=TJFLAG_NOREALLOC, componly=0, decomponly=0, yuv=0, quiet=0, dotile=0,
+	pf=TJPF_BGR, yuvpad=1;
 char *ext="ppm";
 const char *pixFormatStr[TJ_NUMPF]=
 {
@@ -54,15 +55,31 @@
 };
 const char *subNameLong[TJ_NUMSAMP]=
 {
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *subName[NUMSUBOPT]={"444", "422", "420", "GRAY", "440"};
+const char *csName[TJ_NUMCS]=
+{
+	"RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
+};
+const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
 tjscalingfactor *scalingfactors=NULL, sf={1, 1};  int nsf=0;
 int xformop=TJXOP_NONE, xformopt=0;
 int (*customFilter)(short *, tjregion, tjregion, int, int, tjtransform *);
 double benchtime=5.0;
 
 
+char *formatName(int subsamp, int cs, char *buf)
+{
+	if(cs==TJCS_YCbCr) return (char *)subNameLong[subsamp];
+	else if(cs==TJCS_YCCK)
+	{
+		snprintf(buf, 80, "%s %s", csName[cs], subNameLong[subsamp]);
+		return buf;
+	}
+	else return (char *)csName[cs];
+}
+
+
 char *sigfig(double val, int figs, char *buf, int len)
 {
 	char format[80];
@@ -94,9 +111,9 @@
 	int row, col, i, dstbufalloc=0, retval=0;
 	double start, elapsed;
 	int ps=tjPixelSize[pf];
-	int yuvsize=tjBufSizeYUV(w, h, subsamp), bufsize;
-	int scaledw=(yuv==YUVDECODE)? w : TJSCALED(w, sf);
-	int scaledh=(yuv==YUVDECODE)? h : TJSCALED(h, sf);
+	int scaledw=TJSCALED(w, sf);
+	int scaledh=TJSCALED(h, sf);
+	int yuvsize=tjBufSizeYUV2(scaledw, yuvpad, scaledh, subsamp), bufsize;
 	int pitch=scaledw*ps;
 	int ntilesw=(w+tilew-1)/tilew, ntilesh=(h+tileh-1)/tileh;
 	unsigned char *dstptr, *dstptr2;
@@ -124,8 +141,9 @@
 	/* Execute once to preload cache */
 	if(yuv==YUVDECODE)
 	{
-		if(tjDecompressToYUV(handle, jpegbuf[0], jpegsize[0], dstbuf, flags)==-1)
-			_throwtj("executing tjDecompressToYUV()");
+		if(tjDecompressToYUV2(handle, jpegbuf[0], jpegsize[0], dstbuf, scaledw,
+			yuvpad, scaledh, flags)==-1)
+			_throwtj("executing tjDecompressToYUV2()");
 	}
 	else if(tjDecompress2(handle, jpegbuf[0], jpegsize[0], dstbuf, scaledw,
 		pitch, scaledh, pf, flags)==-1)
@@ -137,8 +155,9 @@
 		int tile=0;
 		if(yuv==YUVDECODE)
 		{
-			if(tjDecompressToYUV(handle, jpegbuf[0], jpegsize[0], dstbuf, flags)==-1)
-			_throwtj("executing tjDecompressToYUV()");
+			if(tjDecompressToYUV2(handle, jpegbuf[0], jpegsize[0], dstbuf, scaledw,
+				yuvpad, scaledh, flags)==-1)
+				_throwtj("executing tjDecompressToYUV2()");
 		}
 		else for(row=0, dstptr=dstbuf; row<ntilesh; row++, dstptr+=pitch*tileh)
 		{
@@ -167,10 +186,19 @@
 		printf("     Dest. throughput:     %f Megapixels/sec\n",
 			(double)(w*h)/1000000.*(double)i/elapsed);
 	}
+	if(sf.num!=1 || sf.denom!=1)
+		snprintf(sizestr, 20, "%d_%d", sf.num, sf.denom);
+	else if(tilew!=w || tileh!=h)
+		snprintf(sizestr, 20, "%dx%d", tilew, tileh);
+	else snprintf(sizestr, 20, "full");
+	if(decomponly)
+		snprintf(tempstr, 1024, "%s_%s.%s", filename, sizestr, ext);
+	else
+		snprintf(tempstr, 1024, "%s_%s%s_%s.%s", filename, subName[subsamp],
+			qualstr, sizestr, ext);
+
 	if(yuv==YUVDECODE)
 	{
-		snprintf(tempstr, 1024, "%s_%s%s.yuv", filename, subName[subsamp],
-			qualstr);
 		if((file=fopen(tempstr, "wb"))==NULL)
 			_throwunix("opening YUV image for output");
 		if(fwrite(dstbuf, yuvsize, 1, file)!=1)
@@ -179,16 +207,6 @@
 	}
 	else
 	{
-		if(sf.num!=1 || sf.denom!=1)
-			snprintf(sizestr, 20, "%d_%d", sf.num, sf.denom);
-		else if(tilew!=w || tileh!=h)
-			snprintf(sizestr, 20, "%dx%d", tilew, tileh);
-		else snprintf(sizestr, 20, "full");
-		if(decomponly)
-			snprintf(tempstr, 1024, "%s_%s.%s", filename, sizestr, ext);
-		else
-			snprintf(tempstr, 1024, "%s_%s%s_%s.%s", filename, subName[subsamp],
-				qualstr, sizestr, ext);
 		if(savebmp(tempstr, dstbuf, scaledw, scaledh, pf,
 			(flags&TJFLAG_BOTTOMUP)!=0)==-1)
 			_throwbmp("saving bitmap");
@@ -323,7 +341,9 @@
 	double start, elapsed;
 	int totaljpegsize=0, row, col, i, tilew=w, tileh=h, retval=0;
 	unsigned long *jpegsize=NULL;
-	int ps=tjPixelSize[pf], ntilesw=1, ntilesh=1, pitch=w*ps;
+	int ps=(yuv==YUVCOMPRESS)? 3:tjPixelSize[pf];
+	int ntilesw=1, ntilesh=1, pitch=w*ps;
+	const char *pfStr=(yuv==YUVCOMPRESS)? "YUV":pixFormatStr[pf];
 
 	if(yuv==YUVENCODE) {dotestyuv(srcbuf, w, h, subsamp, filename);  return;}
 
@@ -331,7 +351,7 @@
 		_throwunix("allocating temporary image buffer");
 
 	if(!quiet)
-		printf(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pixFormatStr[pf],
+		printf(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
 			(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down", subNameLong[subsamp],
 			jpegqual);
 
@@ -359,17 +379,27 @@
 
 		/* Compression test */
 		if(quiet==1)
-			printf("%s\t%s\t%s\t%d\t", pixFormatStr[pf],
-				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp], jpegqual);
-		for(i=0; i<h; i++)
-			memcpy(&tmpbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
+			printf("%s\t%s\t%s\t%d\t", pfStr, (flags&TJFLAG_BOTTOMUP)? "BU":"TD",
+				subNameLong[subsamp], jpegqual);
+		if(yuv!=YUVCOMPRESS)
+			for(i=0; i<h; i++)
+				memcpy(&tmpbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
 		if((handle=tjInitCompress())==NULL)
 			_throwtj("executing tjInitCompress()");
 
 		/* Execute once to preload cache */
-		if(tjCompress2(handle, srcbuf, tilew, pitch, tileh, pf, &jpegbuf[0],
-			&jpegsize[0], subsamp, jpegqual, flags)==-1)
-			_throwtj("executing tjCompress2()");
+		if(yuv==YUVCOMPRESS)
+		{
+			if(tjCompressFromYUV(handle, srcbuf, tilew, yuvpad, tileh, subsamp,
+				&jpegbuf[0], &jpegsize[0], jpegqual, flags)==-1)
+				_throwtj("executing tjCompressFromYUV()");
+		}
+		else
+		{
+			if(tjCompress2(handle, srcbuf, tilew, pitch, tileh, pf, &jpegbuf[0],
+				&jpegsize[0], subsamp, jpegqual, flags)==-1)
+				_throwtj("executing tjCompress2()");
+		}
 
 		/* Benchmark */
 		for(i=0, start=gettime(); (elapsed=gettime()-start)<benchtime; i++)
@@ -383,9 +413,18 @@
 				{
 					int width=min(tilew, w-col*tilew);
 					int height=min(tileh, h-row*tileh);
-					if(tjCompress2(handle, srcptr2, width, pitch, height, pf,
-						&jpegbuf[tile], &jpegsize[tile], subsamp, jpegqual, flags)==-1)
-						_throwtj("executing tjCompress()2");
+					if(yuv==YUVCOMPRESS)
+					{
+						if(tjCompressFromYUV(handle, srcptr2, width, yuvpad, height,
+							subsamp, &jpegbuf[tile], &jpegsize[tile], jpegqual, flags)==-1)
+							_throwtj("executing tjCompressFromYUV()");
+					}
+					else
+					{
+						if(tjCompress2(handle, srcptr2, width, pitch, height, pf,
+							&jpegbuf[tile], &jpegsize[tile], subsamp, jpegqual, flags)==-1)
+							_throwtj("executing tjCompress2()");
+					}
 					totaljpegsize+=jpegsize[tile];
 				}
 			}
@@ -429,9 +468,12 @@
 		}
 
 		/* Decompression test */
-		if(decomptest(srcbuf, jpegbuf, jpegsize, tmpbuf, w, h, subsamp, jpegqual,
-			filename, tilew, tileh)==-1)
-			goto bailout;
+		if(!componly)
+		{
+			if(decomptest(srcbuf, jpegbuf, jpegsize, tmpbuf, w, h, subsamp, jpegqual,
+				filename, tilew, tileh)==-1)
+				goto bailout;
+		}
 
 		for(i=0; i<ntilesw*ntilesh; i++)
 		{
@@ -466,7 +508,7 @@
 	unsigned char **jpegbuf=NULL, *srcbuf=NULL;
 	unsigned long *jpegsize=NULL, srcsize, totaljpegsize;
 	tjtransform *t=NULL;
-	int w=0, h=0, subsamp=-1, _w, _h, _tilew, _tileh,
+	int w=0, h=0, subsamp=-1, cs=-1, _w, _h, _tilew, _tileh,
 		_ntilesw, _ntilesh, _subsamp;
 	char *temp=NULL, tempstr[80], tempstr2[80];
 	int row, col, i, tilew, tileh, ntilesw=1, ntilesh=1, retval=0;
@@ -490,20 +532,25 @@
 
 	if((handle=tjInitTransform())==NULL)
 		_throwtj("executing tjInitTransform()");
-	if(tjDecompressHeader2(handle, srcbuf, srcsize, &w, &h, &subsamp)==-1)
-		_throwtj("executing tjDecompressHeader2()");
+	if(tjDecompressHeader3(handle, srcbuf, srcsize, &w, &h, &subsamp, &cs)==-1)
+		_throwtj("executing tjDecompressHeader3()");
 
 	if(quiet==1)
 	{
 		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap\tBitmap\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
+		printf("Bitmap\tBitmap\tJPEG\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
 			dotile? "Tile ":"Image", dotile? "Tile ":"Image");
-		printf("Format\tOrder\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n\n");
+		printf("Format\tOrder\tCS\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n\n");
 	}
 	else if(!quiet)
 	{
-		printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n", subNameLong[subsamp],
-			pixFormatStr[pf], (flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down");
+		if(yuv==YUVDECODE)
+			printf(">>>>>  JPEG %s --> YUV  <<<<<\n",
+				formatName(subsamp, cs, tempstr));
+		else
+			printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n",
+				formatName(subsamp, cs, tempstr), pixFormatStr[pf],
+				(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down");
 	}
 
 	for(tilew=dotile? 16:w, tileh=dotile? 16:h; ; tilew*=2, tileh*=2)
@@ -539,8 +586,8 @@
 		}
 		else if(quiet==1)
 		{
-			printf("%s\t%s\t%s\t", pixFormatStr[pf],
-				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp]);
+			printf("%s\t%s\t%s\t%s\t", pixFormatStr[pf],
+				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", csName[cs], subNameLong[subsamp]);
 			printf("%-4d  %-4d\t", tilew, tileh);
 		}
 
@@ -667,7 +714,7 @@
 {
 	int i;
 	printf("USAGE: %s\n", progname);
-	printf("       <Inputfile (BMP|PPM)> <Quality> [options]\n\n");
+	printf("       <Inputfile (BMP|PPM|YUV)> <Quality> [options]\n\n");
 	printf("       %s\n", progname);
 	printf("       <Inputfile (JPG)> [options]\n\n");
 	printf("Options:\n\n");
@@ -676,8 +723,6 @@
 	printf("-bottomup = Test bottom-up compression/decompression\n");
 	printf("-tile = Test performance of the codec when the image is encoded as separate\n");
 	printf("     tiles of varying sizes.\n");
-	printf("-forcemmx, -forcesse, -forcesse2, -forcesse3 =\n");
-	printf("     Force MMX, SSE, SSE2, or SSE3 code paths in the underlying codec\n");
 	printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
 	printf("     Test the specified color conversion path in the codec (default: BGR)\n");
 	printf("-fastupsample = Use the fastest chrominance upsampling algorithm available in\n");
@@ -686,10 +731,21 @@
 	printf("     codec\n");
 	printf("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the\n");
 	printf("     underlying codec\n");
-	printf("-440 = Test 4:4:0 chrominance subsampling instead of 4:2:2\n");
+	printf("-subsamp <s> = if compressing a JPEG image from a YUV planar source image,\n");
+	printf("     this specifies the level of chrominance subsampling used in the source\n");
+	printf("     image.  Otherwise, this specifies the level of chrominance subsampling\n");
+	printf("     to use in the JPEG destination image.  <s> = 444, 422, 440, 420, 411,\n");
+	printf("     or GRAY\n");
 	printf("-quiet = Output results in tabular rather than verbose format\n");
 	printf("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG\n");
 	printf("-yuvdecode = Decode JPEG image to planar YUV rather than RGB\n");
+	printf("-yuvsize WxH = if compressing a JPEG image from a YUV planar source image, this\n");
+	printf("     specifies the width and height of the source image.\n");
+	printf("-yuvpad <p> = if compressing a JPEG image from a YUV planar source image, this\n");
+	printf("     specifies the number of bytes to which each row of each plane in the\n");
+	printf("     source image is padded.  If decompressing a JPEG image to a YUV planar\n");
+	printf("     destination image, this specifies the row padding for each plane of the\n");
+	printf("     destination image. (default=1)\n");
 	printf("-scale M/N = scale down the width/height of the decompressed JPEG image by a\n");
 	printf("     factor of M/N (M/N = ");
 	for(i=0; i<nsf; i++)
@@ -709,7 +765,8 @@
 	printf("     decompression (these options are mutually exclusive)\n");
 	printf("-grayscale = Perform lossless grayscale conversion prior to decompression\n");
 	printf("     test (can be combined with the other transforms above)\n");
-	printf("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n\n");
+	printf("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n");
+	printf("-componly = Stop after running compression tests.  Do not test decompression.\n\n");
 	printf("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate\n");
 	printf("test will be performed for all quality values in the range.\n\n");
 	exit(1);
@@ -718,9 +775,9 @@
 
 int main(int argc, char *argv[])
 {
-	unsigned char *srcbuf=NULL;  int w, h, i, j;
+	unsigned char *srcbuf=NULL;  int w=0, h=0, i, j;
 	int minqual=-1, maxqual=-1;  char *temp;
-	int minarg=2, retval=0, do440=0;
+	int minarg=2, retval=0, subsamp=-1;
 
 	if((scalingfactors=tjGetScalingFactors(&nsf))==NULL || nsf==0)
 		_throwtj("executing tjGetScalingFactors()");
@@ -732,6 +789,7 @@
 	{
 		if(!strcasecmp(temp, ".bmp")) ext="bmp";
 		if(!strcasecmp(temp, ".jpg") || !strcasecmp(temp, ".jpeg")) decomponly=1;
+		if(!strcasecmp(temp, ".yuv")) yuv=YUVCOMPRESS;
 	}
 
 	printf("\n");
@@ -776,26 +834,6 @@
 			{
 				dotile=1;  xformopt|=TJXOPT_CROP;
 			}
-			if(!strcasecmp(argv[i], "-forcesse3"))
-			{
-				printf("Forcing SSE3 code\n\n");
-				flags|=TJFLAG_FORCESSE3;
-			}
-			if(!strcasecmp(argv[i], "-forcesse2"))
-			{
-				printf("Forcing SSE2 code\n\n");
-				flags|=TJFLAG_FORCESSE2;
-			}
-			if(!strcasecmp(argv[i], "-forcesse"))
-			{
-				printf("Forcing SSE code\n\n");
-				flags|=TJFLAG_FORCESSE;
-			}
-			if(!strcasecmp(argv[i], "-forcemmx"))
-			{
-				printf("Forcing MMX code\n\n");
-				flags|=TJFLAG_FORCEMMX;
-			}
 			if(!strcasecmp(argv[i], "-fastupsample"))
 			{
 				printf("Using fast upsampling code\n\n");
@@ -811,7 +849,6 @@
 				printf("Using most accurate DCT/IDCT algorithm\n\n");
 				flags|=TJFLAG_ACCURATEDCT;
 			}
-			if(!strcmp(argv[i], "-440")) do440=1;
 			if(!strcasecmp(argv[i], "-rgb")) pf=TJPF_RGB;
 			if(!strcasecmp(argv[i], "-rgbx")) pf=TJPF_RGBX;
 			if(!strcasecmp(argv[i], "-bgr")) pf=TJPF_BGR;
@@ -858,9 +895,44 @@
 			if(!strcmp(argv[i], "-?")) usage(argv[0]);
 			if(!strcasecmp(argv[i], "-alloc")) flags&=(~TJFLAG_NOREALLOC);
 			if(!strcasecmp(argv[i], "-bmp")) ext="bmp";
+			if(!strcasecmp(argv[i], "-yuvsize") && i<argc-1)
+			{
+				int temp1=0, temp2=0;
+				if(sscanf(argv[++i], "%dx%d", &temp1, &temp2)==2 && temp1>=1
+					&& temp2>=1)
+				{
+					w=temp1;  h=temp2;
+				}
+				else usage(argv[0]);
+			}
+			if(!strcasecmp(argv[i], "-yuvpad") && i<argc-1)
+			{
+				int temp=atoi(argv[++i]);
+				if(temp>=1) yuvpad=temp;
+			}
+			if(!strcasecmp(argv[i], "-subsamp") && i<argc-1)
+			{
+				i++;
+				if(toupper(argv[i][0])=='G') subsamp=TJSAMP_GRAY;
+				else
+				{
+					int temp=atoi(argv[i]);
+					switch(temp)
+					{
+						case 444:  subsamp=TJSAMP_444;  break;
+						case 422:  subsamp=TJSAMP_422;  break;
+						case 440:  subsamp=TJSAMP_440;  break;
+						case 420:  subsamp=TJSAMP_420;  break;
+						case 411:  subsamp=TJSAMP_411;  break;
+					}
+				}
+			}
+			if(!strcasecmp(argv[i], "-componly")) componly=1;
 		}
 	}
 
+	if(yuv) ext="yuv";
+
 	if((sf.num!=1 || sf.denom!=1) && dotile)
 	{
 		printf("Disabling tiled compression/decompression tests, because those tests do not\n");
@@ -871,14 +943,38 @@
 	if(yuv && dotile)
 	{
 		printf("Disabling tiled compression/decompression tests, because those tests do not\n");
-		printf("work when YUV encoding or decoding is enabled.\n\n");
+		printf("work when YUV encoding, compression, or decoding is enabled.\n\n");
 		dotile=0;
 	}
 
 	if(!decomponly)
 	{
-		if(loadbmp(argv[1], &srcbuf, &w, &h, pf, (flags&TJFLAG_BOTTOMUP)!=0)==-1)
-			_throwbmp("loading bitmap");
+		if(yuv==YUVCOMPRESS)
+		{
+			FILE *file=NULL;  unsigned long srcsize;
+			if(w<1 || h<1 || subsamp<0 || subsamp>=TJ_NUMSAMP)
+				_throw("opening YUV image file",
+					"YUV image size and/or subsampling not specified");
+			if((file=fopen(argv[1], "rb"))==NULL)
+				_throwunix("opening YUV image file");
+			if(fseek(file, 0, SEEK_END)<0 ||
+				(srcsize=ftell(file))==(unsigned long)-1)
+				_throwunix("determining YUV image file size");
+			if(srcsize!=tjBufSizeYUV2(w, yuvpad, h, subsamp))
+				_throw("opening YUV image file", "YUV image file is the wrong size");
+			if((srcbuf=(unsigned char *)malloc(srcsize))==NULL)
+				_throwunix("allocating memory for YUV image");
+			if(fseek(file, 0, SEEK_SET)<0)
+				_throwunix("setting YUV image file position");
+			if(fread(srcbuf, srcsize, 1, file)<1)
+				_throwunix("reading YUV data");
+			fclose(file);  file=NULL;
+		}
+		else
+		{
+			if(loadbmp(argv[1], &srcbuf, &w, &h, pf, (flags&TJFLAG_BOTTOMUP)!=0)==-1)
+				_throwbmp("loading bitmap");
+		}
 		temp=strrchr(argv[1], '.');
 		if(temp!=NULL) *temp='\0';
 	}
@@ -897,18 +993,27 @@
 		printf("\n");
 		goto bailout;
 	}
-	for(i=maxqual; i>=minqual; i--)
-		dotest(srcbuf, w, h, TJ_GRAYSCALE, i, argv[1]);
-	printf("\n");
-	for(i=maxqual; i>=minqual; i--)
-		dotest(srcbuf, w, h, TJ_420, i, argv[1]);
-	printf("\n");
-	for(i=maxqual; i>=minqual; i--)
-		dotest(srcbuf, w, h, do440? TJSAMP_440:TJ_422, i, argv[1]);
-	printf("\n");
-	for(i=maxqual; i>=minqual; i--)
-		dotest(srcbuf, w, h, TJ_444, i, argv[1]);
-	printf("\n");
+	if(yuv==YUVCOMPRESS || (subsamp>=0 && subsamp<TJ_NUMSAMP))
+	{
+		for(i=maxqual; i>=minqual; i--)
+			dotest(srcbuf, w, h, subsamp, i, argv[1]);
+		printf("\n");
+	}
+	else
+	{
+		for(i=maxqual; i>=minqual; i--)
+			dotest(srcbuf, w, h, TJSAMP_GRAY, i, argv[1]);
+		printf("\n");
+		for(i=maxqual; i>=minqual; i--)
+			dotest(srcbuf, w, h, TJSAMP_420, i, argv[1]);
+		printf("\n");
+		for(i=maxqual; i>=minqual; i--)
+			dotest(srcbuf, w, h, TJSAMP_422, i, argv[1]);
+		printf("\n");
+		for(i=maxqual; i>=minqual; i--)
+			dotest(srcbuf, w, h, TJSAMP_444, i, argv[1]);
+		printf("\n");
+	}
 
 	bailout:
 	if(srcbuf) free(srcbuf);
diff --git a/tjunittest.c b/tjunittest.c
index 3bb194d..d2d3e98 100644
--- a/tjunittest.c
+++ b/tjunittest.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2012, 2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,8 @@
 	printf("\nUSAGE: %s [options]\n", progName);
 	printf("Options:\n");
 	printf("-yuv = test YUV encoding/decoding support\n");
+	printf("-noyuvpad = do not pad each line of each Y, U, and V plane to the nearest\n");
+	printf("            4-byte boundary\n");
 	printf("-alloc = test automatic buffer allocation\n");
 	exit(1);
 }
@@ -59,25 +61,26 @@
 
 const char *subNameLong[TJ_NUMSAMP]=
 {
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440"};
+const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
 
 const char *pixFormatStr[TJ_NUMPF]=
 {
 	"RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
-	"RGBA", "BGRA", "ABGR", "ARGB"
+	"RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
 };
 
-const int alphaOffset[TJ_NUMPF] = {-1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0};
+const int alphaOffset[TJ_NUMPF] = {-1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1};
 
 const int _3byteFormats[]={TJPF_RGB, TJPF_BGR};
-const int _4byteFormats[]={TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB};
+const int _4byteFormats[]={TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB,
+	TJPF_CMYK};
 const int _onlyGray[]={TJPF_GRAY};
 const int _onlyRGB[]={TJPF_RGB};
 
 enum {YUVENCODE=1, YUVDECODE};
-int yuv=0, alloc=0;
+int doyuv=0, alloc=0, pad=4;
 
 int exitStatus=0;
 #define bailout() {exitStatus=-1;  goto bailout;}
@@ -91,9 +94,9 @@
 	int ps=tjPixelSize[pf];
 	int index, row, col, halfway=16;
 
-	memset(buf, 0, w*h*ps);
 	if(pf==TJPF_GRAY)
 	{
+		memset(buf, 0, w*h*ps);
 		for(row=0; row<h; row++)
 		{
 			for(col=0; col<w; col++)
@@ -105,8 +108,30 @@
 			}
 		}
 	}
+	else if(pf==TJPF_CMYK)
+	{
+		memset(buf, 255, w*h*ps);
+		for(row=0; row<h; row++)
+		{
+			for(col=0; col<w; col++)
+			{
+				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
+				else index=row*w+col;
+				if(((row/8)+(col/8))%2==0)
+				{
+					if(row>=halfway) buf[index*ps+3]=0;
+				}
+				else
+				{
+					buf[index*ps+2]=0;
+					if(row<halfway) buf[index*ps+1]=0;
+				}
+			}
+		}
+	}
 	else
 	{
+		memset(buf, 0, w*h*ps);
 		for(row=0; row<h; row++)
 		{
 			for(col=0; col<w; col++)
@@ -165,6 +190,36 @@
 	int halfway=16*sf.num/sf.denom;
 	int blocksize=8*sf.num/sf.denom;
 
+	if(pf==TJPF_CMYK)
+	{
+		for(row=0; row<h; row++)
+		{
+			for(col=0; col<w; col++)
+			{
+				unsigned char c, m, y, k;
+				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
+				else index=row*w+col;
+				c=buf[index*ps];
+				m=buf[index*ps+1];
+				y=buf[index*ps+2];
+				k=buf[index*ps+3];
+				if(((row/blocksize)+(col/blocksize))%2==0)
+				{
+					checkval255(c);  checkval255(m);  checkval255(y);
+					if(row<halfway) checkval255(k)
+					else checkval0(k)
+				}
+				else
+				{
+					checkval255(c);  checkval0(y);  checkval255(k);
+					if(row<halfway) checkval0(m)
+					else checkval255(m)
+				}
+			}
+		}
+		return 1;
+	}
+
 	for(row=0; row<h; row++)
 	{
 		for(col=0; col<w; col++)
@@ -223,8 +278,13 @@
 		{
 			for(col=0; col<w; col++)
 			{
-				printf("%.3d/%.3d/%.3d ", buf[(row*w+col)*ps+roffset],
-					buf[(row*w+col)*ps+goffset], buf[(row*w+col)*ps+boffset]);
+				if(pf==TJPF_CMYK)
+					printf("%.3d/%.3d/%.3d/%.3d ", buf[(row*w+col)*ps],
+						buf[(row*w+col)*ps+1], buf[(row*w+col)*ps+2],
+						buf[(row*w+col)*ps+3]);
+				else
+					printf("%.3d/%.3d/%.3d ", buf[(row*w+col)*ps+roffset],
+						buf[(row*w+col)*ps+goffset], buf[(row*w+col)*ps+boffset]);
 			}
 			printf("\n");
 		}
@@ -235,22 +295,24 @@
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
 
-int checkBufYUV(unsigned char *buf, int w, int h, int subsamp)
+int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
+	tjscalingfactor sf)
 {
 	int row, col;
 	int hsf=tjMCUWidth[subsamp]/8, vsf=tjMCUHeight[subsamp]/8;
 	int pw=PAD(w, hsf), ph=PAD(h, vsf);
 	int cw=pw/hsf, ch=ph/vsf;
-	int ypitch=PAD(pw, 4), uvpitch=PAD(cw, 4);
+	int ypitch=PAD(pw, pad), uvpitch=PAD(cw, pad);
 	int retval=1;
-	int halfway=16;
+	int halfway=16*sf.num/sf.denom;
+	int blocksize=8*sf.num/sf.denom;
 
 	for(row=0; row<ph; row++)
 	{
 		for(col=0; col<pw; col++)
 		{
 			unsigned char y=buf[ypitch*row+col];
-			if(((row/8)+(col/8))%2==0)
+			if(((row/blocksize)+(col/blocksize))%2==0)
 			{
 				if(row<halfway) checkval255(y)  else checkval0(y);
 			}
@@ -262,14 +324,14 @@
 	}
 	if(subsamp!=TJSAMP_GRAY)
 	{
-		halfway=16/vsf;
+		int halfway=16/vsf*sf.num/sf.denom;
 		for(row=0; row<ch; row++)
 		{
 			for(col=0; col<cw; col++)
 			{
 				unsigned char u=buf[ypitch*ph + (uvpitch*row+col)],
 					v=buf[ypitch*ph + uvpitch*ch + (uvpitch*row+col)];
-				if(((row*vsf/8)+(col*hsf/8))%2==0)
+				if(((row*vsf/blocksize)+(col*hsf/blocksize))%2==0)
 				{
 					checkval(u, 128);  checkval(v, 128);
 				}
@@ -335,57 +397,60 @@
 	unsigned long *dstSize, int w, int h, int pf, char *basename,
 	int subsamp, int jpegQual, int flags)
 {
-	char tempStr[1024];  unsigned char *srcBuf=NULL;
-	double t;
-
-	if(yuv==YUVENCODE)
-		printf("%s %s -> %s YUV ... ", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ", subNameLong[subsamp]);
-	else
-		printf("%s %s -> %s Q%d ... ", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ", subNameLong[subsamp],
-			jpegQual);
+	char tempStr[1024];  unsigned char *srcBuf=NULL, *yuvBuf=NULL;
+	const char *pfStr=pixFormatStr[pf];
+	const char *buStrLong=(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ";
+	const char *buStr=(flags&TJFLAG_BOTTOMUP)? "BU":"TD";
 
 	if((srcBuf=(unsigned char *)malloc(w*h*tjPixelSize[pf]))==NULL)
 		_throw("Memory allocation failure");
 	initBuf(srcBuf, w, h, pf, flags);
+
 	if(*dstBuf && *dstSize>0) memset(*dstBuf, 0, *dstSize);
 
-	t=gettime();
-	if(yuv==YUVENCODE)
+
+	if(!alloc) flags|=TJFLAG_NOREALLOC;
+	if(doyuv)
 	{
-		_tj(tjEncodeYUV2(handle, srcBuf, w, 0, h, pf, *dstBuf, subsamp, flags));
+		unsigned long yuvSize=tjBufSizeYUV2(w, pad, h, subsamp);
+		tjscalingfactor sf={1, 1};
+		tjhandle handle2=tjInitCompress();
+		if(!handle2) _throwtj();
+
+		if((yuvBuf=(unsigned char *)malloc(yuvSize))==NULL)
+			_throw("Memory allocation failure");
+		memset(yuvBuf, 0, yuvSize);
+
+		printf("%s %s -> YUV %s ... ", pfStr, buStrLong, subNameLong[subsamp]);
+		_tj(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
+			flags));
+		tjDestroy(handle2);
+		snprintf(tempStr, 1024, "%s_enc_%s_%s_%s.yuv", basename, pfStr, buStr,
+			subName[subsamp]);
+		writeJPEG(yuvBuf, yuvSize, tempStr);
+		if(checkBufYUV(yuvBuf, w, h, subsamp, sf)) printf("Passed.\n");
+		else printf("FAILED!\n");
+
+		printf("YUV %s %s -> JPEG Q%d ... ", subNameLong[subsamp], buStrLong,
+			jpegQual);
+		_tj(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf,
+			dstSize, jpegQual, flags));
 	}
 	else
 	{
-		if(!alloc)
-		{
-			flags|=TJFLAG_NOREALLOC;
-			*dstSize=(yuv==YUVENCODE? tjBufSizeYUV(w, h, subsamp)
-				: tjBufSize(w, h, subsamp));
-		}
+		printf("%s %s -> %s Q%d ... ", pfStr, buStrLong, subNameLong[subsamp],
+			jpegQual);
 		_tj(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
 			jpegQual, flags));
 	}
-	t=gettime()-t;
 
-	if(yuv==YUVENCODE)
-		snprintf(tempStr, 1024, "%s_enc_%s_%s_%s.yuv", basename, pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subName[subsamp]);
-	else
-		snprintf(tempStr, 1024, "%s_enc_%s_%s_%s_Q%d.jpg", basename,
-			pixFormatStr[pf], (flags&TJFLAG_BOTTOMUP)? "BU":"TD", subName[subsamp],
-			jpegQual);
+	snprintf(tempStr, 1024, "%s_enc_%s_%s_%s_Q%d.jpg", basename, pfStr, buStr,
+		subName[subsamp], jpegQual);
 	writeJPEG(*dstBuf, *dstSize, tempStr);
-	if(yuv==YUVENCODE)
-	{
-		if(checkBufYUV(*dstBuf, w, h, subsamp)) printf("Passed.");
-		else printf("FAILED!");
-	}
-	else printf("Done.");
-	printf("  %f ms\n  Result in %s\n", t*1000., tempStr);
+	printf("Done.\n  Result in %s\n", tempStr);
 
 	bailout:
+	if(yuvBuf) free(yuvBuf);
 	if(srcBuf) free(srcBuf);
 }
 
@@ -394,16 +459,49 @@
 	unsigned long jpegSize, int w, int h, int pf, char *basename, int subsamp,
 	int flags, tjscalingfactor sf)
 {
-	unsigned char *dstBuf=NULL;
-	int _hdrw=0, _hdrh=0, _hdrsubsamp=-1;  double t;
+	unsigned char *dstBuf=NULL, *yuvBuf=NULL;
+	int _hdrw=0, _hdrh=0, _hdrsubsamp=-1;
 	int scaledWidth=TJSCALED(w, sf);
 	int scaledHeight=TJSCALED(h, sf);
 	unsigned long dstSize=0;
 
-	if(yuv==YUVENCODE) return;
+	_tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
+		&_hdrsubsamp));
+	if(_hdrw!=w || _hdrh!=h || _hdrsubsamp!=subsamp)
+		_throw("Incorrect JPEG header");
 
-	if(yuv==YUVDECODE)
-		printf("JPEG -> YUV %s ... ", subNameLong[subsamp]);
+	dstSize=scaledWidth*scaledHeight*tjPixelSize[pf];
+	if((dstBuf=(unsigned char *)malloc(dstSize))==NULL)
+		_throw("Memory allocation failure");
+	memset(dstBuf, 0, dstSize);
+
+	if(doyuv)
+	{
+		unsigned long yuvSize=tjBufSizeYUV2(scaledWidth, pad, scaledHeight,
+			subsamp);
+		tjhandle handle2=tjInitDecompress();
+		if(!handle2) _throwtj();
+
+		if((yuvBuf=(unsigned char *)malloc(yuvSize))==NULL)
+			_throw("Memory allocation failure");
+		memset(yuvBuf, 0, yuvSize);
+
+		printf("JPEG -> YUV %s ", subNameLong[subsamp]);
+		if(sf.num!=1 || sf.denom!=1)
+			printf("%d/%d ... ", sf.num, sf.denom);
+		else printf("... ");
+		_tj(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth,
+			pad, scaledHeight, flags));
+		if(checkBufYUV(yuvBuf, scaledWidth, scaledHeight, subsamp, sf))
+			printf("Passed.\n");
+		else printf("FAILED!\n");
+
+		printf("YUV %s -> %s %s ... ", subNameLong[subsamp], pixFormatStr[pf],
+			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ");
+		_tj(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
+			scaledHeight, pf, flags));
+		tjDestroy(handle2);
+	}
 	else
 	{
 		printf("JPEG -> %s %s ", pixFormatStr[pf],
@@ -411,45 +509,17 @@
 		if(sf.num!=1 || sf.denom!=1)
 			printf("%d/%d ... ", sf.num, sf.denom);
 		else printf("... ");
-	}
-
-	_tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
-		&_hdrsubsamp));
-	if(_hdrw!=w || _hdrh!=h || _hdrsubsamp!=subsamp)
-		_throw("Incorrect JPEG header");
-
-	if(yuv==YUVDECODE) dstSize=tjBufSizeYUV(w, h, subsamp);
-	else dstSize=scaledWidth*scaledHeight*tjPixelSize[pf];
-	if((dstBuf=(unsigned char *)malloc(dstSize))==NULL)
-		_throw("Memory allocation failure");
-	memset(dstBuf, 0, dstSize);
-
-	t=gettime();
-	if(yuv==YUVDECODE)
-	{
-		_tj(tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags));
-	}
-	else
-	{
 		_tj(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
 			scaledHeight, pf, flags));
 	}
-	t=gettime()-t;
 
-	if(yuv==YUVDECODE)
-	{
-		if(checkBufYUV(dstBuf, w, h, subsamp)) printf("Passed.");
-		else printf("FAILED!");
-	}
-	else
-	{
-		if(checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
-			printf("Passed.");
-		else printf("FAILED!");
-	}
-	printf("  %f ms\n", t*1000.);
+	if(checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
+		printf("Passed.");
+	else printf("FAILED!");
+	printf("\n");
 
 	bailout:
+	if(yuvBuf) free(yuvBuf);
 	if(dstBuf) free(dstBuf);
 }
 
@@ -459,18 +529,19 @@
 	int flags)
 {
 	int i, n=0;
-	tjscalingfactor *sf=tjGetScalingFactors(&n), sf1={1, 1};
+	tjscalingfactor *sf=tjGetScalingFactors(&n);
 	if(!sf || !n) _throwtj();
 
-	if((subsamp==TJSAMP_444 || subsamp==TJSAMP_GRAY) && !yuv)
+	for(i=0; i<n; i++)
 	{
-		for(i=0; i<n; i++)
+		if(subsamp==TJSAMP_444 || subsamp==TJSAMP_GRAY ||
+			(subsamp==TJSAMP_411 && sf[i].num==1 &&
+				(sf[i].denom==2 || sf[i].denom==1)) ||
+			(subsamp!=TJSAMP_411 && sf[i].num==1 &&
+				(sf[i].denom==4 || sf[i].denom==2 || sf[i].denom==1)))
 			_decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp,
 				flags, sf[i]);
 	}
-	else
-		_decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp, flags,
-			sf1);
 
 	bailout:
 	return;
@@ -485,12 +556,10 @@
 	unsigned long size=0;  int pfi, pf, i;
 
 	if(!alloc)
-	{
-		size=(yuv==YUVENCODE? tjBufSizeYUV(w, h, subsamp)
-			: tjBufSize(w, h, subsamp));
+		size=tjBufSize(w, h, subsamp);
+	if(size!=0)
 		if((dstBuf=(unsigned char *)tjAlloc(size))==NULL)
 			_throw("Memory allocation failure.");
-	}
 
 	if((chandle=tjInitCompress())==NULL || (dhandle=tjInitDecompress())==NULL)
 		_throwtj();
@@ -500,13 +569,10 @@
 		for(i=0; i<2; i++)
 		{
 			int flags=0;
-			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440)
+			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440 ||
+				subsamp==TJSAMP_411)
 				flags|=TJFLAG_FASTUPSAMPLE;
-			if(i==1)
-			{
-				if(yuv==YUVDECODE) goto bailout;
-				else flags|=TJFLAG_BOTTOMUP;
-			}
+			if(i==1) flags|=TJFLAG_BOTTOMUP;
 			pf=formats[pfi];
 			compTest(chandle, &dstBuf, &size, w, h, pf, basename, subsamp, 100,
 				flags);
@@ -551,9 +617,9 @@
 				if(h%100==0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", w, h);
 				if((srcBuf=(unsigned char *)malloc(w*h*4))==NULL)
 					_throw("Memory allocation failure");
-				if(!alloc || yuv==YUVENCODE)
+				if(!alloc || doyuv)
 				{
-					if(yuv==YUVENCODE) dstSize=tjBufSizeYUV(w, h, subsamp);
+					if(doyuv) dstSize=tjBufSizeYUV2(w, pad, h, subsamp);
 					else dstSize=tjBufSize(w, h, subsamp);
 					if((dstBuf=(unsigned char *)tjAlloc(dstSize))==NULL)
 						_throw("Memory allocation failure");
@@ -565,10 +631,10 @@
 					else srcBuf[i]=255;
 				}
 
-				if(yuv==YUVENCODE)
+				if(doyuv)
 				{
-					_tj(tjEncodeYUV2(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, subsamp,
-						0));
+					_tj(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
+						subsamp, 0));
 				}
 				else
 				{
@@ -580,9 +646,9 @@
 
 				if((srcBuf=(unsigned char *)malloc(h*w*4))==NULL)
 					_throw("Memory allocation failure");
-				if(!alloc || yuv==YUVENCODE)
+				if(!alloc || doyuv)
 				{
-					if(yuv==YUVENCODE) dstSize=tjBufSizeYUV(h, w, subsamp);
+					if(doyuv) dstSize=tjBufSizeYUV2(h, pad, w, subsamp);
 					else dstSize=tjBufSize(h, w, subsamp);
 					if((dstBuf=(unsigned char *)tjAlloc(dstSize))==NULL)
 						_throw("Memory allocation failure");
@@ -594,10 +660,10 @@
 					else srcBuf[i]=255;
 				}
 
-				if(yuv==YUVENCODE)
+				if(doyuv)
 				{
-					_tj(tjEncodeYUV2(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, subsamp,
-						0));
+					_tj(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
+						subsamp, 0));
 				}
 				else
 				{
@@ -620,7 +686,7 @@
 
 int main(int argc, char *argv[])
 {
-	int doyuv=0, i;
+	int i, num4bf=5;
 	#ifdef _WIN32
 	srand((unsigned int)time(NULL));
 	#endif
@@ -629,41 +695,38 @@
 		for(i=1; i<argc; i++)
 		{
 			if(!strcasecmp(argv[i], "-yuv")) doyuv=1;
+			if(!strcasecmp(argv[i], "-noyuvpad")) pad=1;
 			if(!strcasecmp(argv[i], "-alloc")) alloc=1;
 			if(!strncasecmp(argv[i], "-h", 2) || !strcasecmp(argv[i], "-?"))
 				usage(argv[0]);
 		}
 	}
 	if(alloc) printf("Testing automatic buffer allocation\n");
-	if(doyuv) {yuv=YUVENCODE;  alloc=0;}
+	if(doyuv) num4bf=4;
 	doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
-	doTest(39, 41, _4byteFormats, 4, TJSAMP_444, "test");
+	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_444, "test");
 	doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
-	doTest(35, 39, _4byteFormats, 4, TJSAMP_422, "test");
+	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_422, "test");
 	doTest(39, 41, _3byteFormats, 2, TJSAMP_420, "test");
-	doTest(41, 35, _4byteFormats, 4, TJSAMP_420, "test");
+	doTest(41, 35, _4byteFormats, num4bf, TJSAMP_420, "test");
 	doTest(35, 39, _3byteFormats, 2, TJSAMP_440, "test");
-	doTest(39, 41, _4byteFormats, 4, TJSAMP_440, "test");
-	doTest(35, 39, _onlyGray, 1, TJSAMP_GRAY, "test");
-	doTest(39, 41, _3byteFormats, 2, TJSAMP_GRAY, "test");
-	doTest(41, 35, _4byteFormats, 4, TJSAMP_GRAY, "test");
+	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_440, "test");
+	doTest(41, 35, _3byteFormats, 2, TJSAMP_411, "test");
+	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_411, "test");
+	doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test");
+	doTest(41, 35, _3byteFormats, 2, TJSAMP_GRAY, "test");
+	doTest(35, 39, _4byteFormats, 4, TJSAMP_GRAY, "test");
 	bufSizeTest();
 	if(doyuv)
 	{
 		printf("\n--------------------\n\n");
-		yuv=YUVDECODE;
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_444, "test_yuv0");
-		doTest(35, 39, _onlyRGB, 1, TJSAMP_444, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_422, "test_yuv0");
-		doTest(39, 41, _onlyRGB, 1, TJSAMP_422, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_420, "test_yuv0");
-		doTest(41, 35, _onlyRGB, 1, TJSAMP_420, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_440, "test_yuv0");
-		doTest(35, 39, _onlyRGB, 1, TJSAMP_440, "test_yuv1");
+		doTest(48, 48, _onlyRGB, 1, TJSAMP_411, "test_yuv0");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv0");
-		doTest(35, 39, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv1");
 		doTest(48, 48, _onlyGray, 1, TJSAMP_GRAY, "test_yuv0");
-		doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test_yuv1");
 	}
 
 	return exitStatus;
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index 634bedf..efe5590 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c
@@ -67,16 +67,23 @@
 	return retval;
 }
 
-JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV
-	(JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
+	(JNIEnv *env, jclass cls, jint width, jint pad, jint height, jint subsamp)
 {
-	jint retval=(jint)tjBufSizeYUV(width, height, subsamp);
+	jint retval=(jint)tjBufSizeYUV2(width, pad, height, subsamp);
 	if(retval==-1) _throw(tjGetErrorStr());
 
 	bailout:
 	return retval;
 }
 
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III
+	(JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
+{
+	return Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII(env, cls, width,
+		4, height, subsamp);
+}
+
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
 	(JNIEnv *env, jobject obj)
 {
@@ -207,12 +214,48 @@
 		flags);
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
-		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3BIIII_3BII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pad, jint height,
+		jint subsamp, jbyteArray dst, jint jpegQual, jint flags)
 {
 	tjhandle handle=0;
+	unsigned long jpegSize=0;
 	jsize arraySize=0;
+	unsigned char *srcBuf=NULL, *jpegBuf=NULL;
+
+	gethandle();
+
+	arraySize=tjBufSizeYUV2(width, pad, height, subsamp);
+	if((*env)->GetArrayLength(env, src)<arraySize)
+		_throw("Source buffer is not large enough");
+	jpegSize=tjBufSize(width, height, subsamp);
+	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
+		_throw("Destination buffer is not large enough");
+
+	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+	if(tjCompressFromYUV(handle, srcBuf, width, pad, height, subsamp, &jpegBuf,
+		&jpegSize, jpegQual, flags|TJFLAG_NOREALLOC)==-1)
+	{
+		(*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
+		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+		jpegBuf=srcBuf=NULL;
+		_throw(tjGetErrorStr());
+	}
+
+	bailout:
+	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
+	if(srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+	return (jint)jpegSize;
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
+		jint height, jint pf, jbyteArray dst, jint pad, jint subsamp, jint flags)
+{
+	tjhandle handle=0;
+	jsize arraySize=0, yuvSize;
 	unsigned char *srcBuf=NULL, *dstBuf=NULL;
 
 	gethandle();
@@ -226,15 +269,17 @@
 	arraySize=(pitch==0)? width*tjPixelSize[pf]*height:pitch*height;
 	if((*env)->GetArrayLength(env, src)<arraySize)
 		_throw("Source buffer is not large enough");
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(width, height, subsamp))
+	yuvSize=(jsize)tjBufSizeYUV2(width, pad, height, subsamp);
+	if(yuvSize==(unsigned long)-1)
+		_throw(tjGetErrorStr());
+	if((*env)->GetArrayLength(env, dst)<yuvSize)
 		_throw("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
-		flags)==-1)
+	if(tjEncodeYUV3(handle, srcBuf, width, pitch, height, pf, dstBuf, pad,
+		subsamp, flags)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
@@ -248,12 +293,20 @@
 	return;
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
-	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
 		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
+	Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII(
+		env, obj, src, width, pitch, height, pf, dst, 4, subsamp, flags);
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII
+	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+		jint height, jint pf, jbyteArray dst, jint pad, jint subsamp, jint flags)
+{
 	tjhandle handle=0;
-	jsize arraySize=0;
+	jsize arraySize=0, yuvSize;
 	unsigned char *srcBuf=NULL, *dstBuf=NULL;
 
 	gethandle();
@@ -269,15 +322,17 @@
 	arraySize=(stride==0)? width*height:stride*height;
 	if((*env)->GetArrayLength(env, src)<arraySize)
 		_throw("Source buffer is not large enough");
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(width, height, subsamp))
+	yuvSize=(jsize)tjBufSizeYUV2(width, pad, height, subsamp);
+	if(yuvSize==(unsigned long)-1)
+		_throw(tjGetErrorStr());
+	if((*env)->GetArrayLength(env, dst)<yuvSize)
 		_throw("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjEncodeYUV2(handle, srcBuf, width, stride*sizeof(jint), height, pf,
-		dstBuf, subsamp, flags)==-1)
+	if(tjEncodeYUV3(handle, srcBuf, width, stride*sizeof(jint), height, pf,
+		dstBuf, pad, subsamp, flags)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
@@ -291,6 +346,14 @@
 	return;
 }
 
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
+	(JNIEnv *env, jobject obj, jintArray src, jint width, jint pitch,
+		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+{
+	Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII(
+		env, obj, src, width, pitch, height, pf, dst, 4, subsamp, flags);
+}
+
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
 	(JNIEnv *env, jobject obj)
 {
@@ -355,7 +418,7 @@
 {
 	tjhandle handle=0;
 	unsigned char *jpegBuf=NULL;
-	int width=0, height=0, jpegSubsamp=-1;
+	int width=0, height=0, jpegSubsamp=-1, jpegColorspace=-1;
 
 	gethandle();
 
@@ -364,8 +427,8 @@
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 
-	if(tjDecompressHeader2(handle, jpegBuf, (unsigned long)jpegSize, 
-		&width, &height, &jpegSubsamp)==-1)
+	if(tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize, 
+		&width, &height, &jpegSubsamp, &jpegColorspace)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
 		_throw(tjGetErrorStr());
@@ -374,6 +437,8 @@
 
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
 	(*env)->SetIntField(env, obj, _fid, jpegSubsamp);
+	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegColorspace", "I"));
+	(*env)->SetIntField(env, obj, _fid, jpegColorspace);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
 	(*env)->SetIntField(env, obj, _fid, width);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
@@ -484,13 +549,14 @@
 	
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
-		jint flags)
+		jint desiredWidth, jint pad, jint desiredHeight, jint flags)
 {
 	tjhandle handle=0;
 	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
 	int jpegSubsamp=-1, jpegWidth=0, jpegHeight=0;
+	jsize yuvSize;
 
 	gethandle();
 
@@ -502,15 +568,18 @@
 	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
 	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
-		_throw("Destination buffer is not large enough");
 
+	yuvSize=(jsize)tjBufSizeYUV2(desiredWidth==0? jpegWidth:desiredWidth,
+		pad, desiredHeight==0? jpegHeight:desiredHeight, jpegSubsamp);
+	if(yuvSize==(unsigned long)-1)
+		_throw(tjGetErrorStr());
+	if((*env)->GetArrayLength(env, dst)<yuvSize)
+		_throw("Destination buffer is not large enough");
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
-		flags)==-1)
+	if(tjDecompressToYUV2(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
+		desiredWidth, pad, desiredHeight, flags)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
@@ -524,6 +593,14 @@
 	return;
 }
 
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI
+	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+		jint flags)
+{
+	Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII(
+		env, obj, src, jpegSize, dst, 0, 4, 0, flags);
+}
+
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
 	(JNIEnv *env, jobject obj)
 {
diff --git a/turbojpeg-mapfile b/turbojpeg-mapfile
index bd1ac71..7d174ca 100755
--- a/turbojpeg-mapfile
+++ b/turbojpeg-mapfile
@@ -36,3 +36,14 @@
 		tjInitTransform;
 		tjTransform;
 } TURBOJPEG_1.1;
+
+TURBOJPEG_1.4
+{
+	global:
+		tjBufSizeYUV2;
+		tjCompressFromYUV;
+		tjDecodeYUV;
+		tjDecompressHeader3;
+		tjDecompressToYUV2;
+		tjEncodeYUV3;
+} TURBOJPEG_1.2;
diff --git a/turbojpeg-mapfile.jni b/turbojpeg-mapfile.jni
index ca39c9e..a1be1fd 100755
--- a/turbojpeg-mapfile.jni
+++ b/turbojpeg-mapfile.jni
@@ -36,7 +36,7 @@
 		tjInitTransform;
 		tjTransform;
 		Java_org_libjpegturbo_turbojpeg_TJ_bufSize;
-		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III;
 		Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors;
 		Java_org_libjpegturbo_turbojpeg_TJCompressor_init;
 		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII;
@@ -48,7 +48,7 @@
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy;		
 		Java_org_libjpegturbo_turbojpeg_TJTransformer_init;
 		Java_org_libjpegturbo_turbojpeg_TJTransformer_transform;
@@ -62,3 +62,19 @@
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII;
 		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII;
 } TURBOJPEG_1.2;
+
+TURBOJPEG_1.4
+{
+	global:
+		tjBufSizeYUV2;
+		tjCompressFromYUV;
+		tjDecodeYUV;
+		tjDecompressHeader3;
+		tjDecompressToYUV2;
+		tjEncodeYUV3;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3BIIII_3BII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BIIII;
+} TURBOJPEG_1.3;
diff --git a/turbojpeg.c b/turbojpeg.c
index 9117273..5aee2e8 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2012, 2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -39,12 +39,14 @@
 #include "./turbojpeg.h"
 #include "./tjutil.h"
 #include "transupp.h"
+#include "./jpegcomp.h"
 
 extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **,
 	unsigned long *, boolean);
 extern void jpeg_mem_src_tj(j_decompress_ptr, unsigned char *, unsigned long);
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
+#define isPow2(x) (((x)&(x-1))==0)
 
 
 /* Error handling (based on example in example.c) */
@@ -85,7 +87,7 @@
 	int init;
 } tjinstance;
 
-static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3};
+static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3, 3};
 
 static const JXFORM_CODE xformtypes[TJ_NUMXOP]=
 {
@@ -185,6 +187,8 @@
 			cinfo->in_color_space=JCS_RGB;  pixelFormat=TJPF_RGB;
 			break;
 		#endif
+		case TJPF_CMYK:
+			cinfo->in_color_space=JCS_CMYK;  break;
 	}
 
 	cinfo->input_components=tjPixelSize[pixelFormat];
@@ -197,15 +201,20 @@
 	}
 	if(subsamp==TJSAMP_GRAY)
 		jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
-	else
-		jpeg_set_colorspace(cinfo, JCS_YCbCr);
+	else if(pixelFormat==TJPF_CMYK)
+		jpeg_set_colorspace(cinfo, JCS_YCCK);
+	else jpeg_set_colorspace(cinfo, JCS_YCbCr);
 
 	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;
 	cinfo->comp_info[1].h_samp_factor=1;
 	cinfo->comp_info[2].h_samp_factor=1;
+	if(cinfo->num_components>3)
+		cinfo->comp_info[3].h_samp_factor=tjMCUWidth[subsamp]/8;
 	cinfo->comp_info[0].v_samp_factor=tjMCUHeight[subsamp]/8;
 	cinfo->comp_info[1].v_samp_factor=1;
 	cinfo->comp_info[2].v_samp_factor=1;
+	if(cinfo->num_components>3)
+		cinfo->comp_info[3].v_samp_factor=tjMCUHeight[subsamp]/8;
 
 	return retval;
 }
@@ -255,6 +264,8 @@
 		case TJPF_ABGR:
 			dinfo->out_color_space=JCS_RGB;  break;
 		#endif
+		case TJPF_CMYK:
+			dinfo->out_color_space=JCS_CMYK;  break;
 		default:
 			_throw("Unsupported pixel format");
 	}
@@ -271,7 +282,10 @@
 	int retval=-1, i, k;
 	for(i=0; i<NUMSUBOPT; i++)
 	{
-		if(dinfo->num_components==pixelsize[i])
+		if(dinfo->num_components==pixelsize[i]
+			|| ((dinfo->jpeg_color_space==JCS_YCCK
+				|| dinfo->jpeg_color_space==JCS_CMYK)
+					&& pixelsize[i]==3 && dinfo->num_components==4))
 		{
 			if(dinfo->comp_info[0].h_samp_factor==tjMCUWidth[i]/8
 				&& dinfo->comp_info[0].v_samp_factor==tjMCUHeight[i]/8)
@@ -279,8 +293,13 @@
 				int match=0;
 				for(k=1; k<dinfo->num_components; k++)
 				{
-					if(dinfo->comp_info[k].h_samp_factor==1
-						&& dinfo->comp_info[k].v_samp_factor==1)
+					int href=1, vref=1;
+					if(dinfo->jpeg_color_space==JCS_YCCK && k==3)
+					{
+						href=tjMCUWidth[i]/8;  vref=tjMCUHeight[i]/8;
+					}
+					if(dinfo->comp_info[k].h_samp_factor==href
+						&& dinfo->comp_info[k].v_samp_factor==vref)
 						match++;
 				}
 				if(match==dinfo->num_components-1)
@@ -547,22 +566,28 @@
 }
 
 
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
 	int subsamp)
 {
 	unsigned long retval=0;
 	int pw, ph, cw, ch;
-	if(width<1 || height<1 || subsamp<0 || subsamp>=NUMSUBOPT)
-		_throw("tjBufSizeYUV(): Invalid argument");
+	if(width<1 || height<1 || pad<1 || !isPow2(pad) || subsamp<0
+		|| subsamp>=NUMSUBOPT)
+		_throw("tjBufSizeYUV2(): Invalid argument");
 	pw=PAD(width, tjMCUWidth[subsamp]/8);
 	ph=PAD(height, tjMCUHeight[subsamp]/8);
 	cw=pw*8/tjMCUWidth[subsamp];  ch=ph*8/tjMCUHeight[subsamp];
-	retval=PAD(pw, 4)*ph + (subsamp==TJSAMP_GRAY? 0:PAD(cw, 4)*ch*2);
+	retval=PAD(pw, pad)*ph + (subsamp==TJSAMP_GRAY? 0:PAD(cw, pad)*ch*2);
 
 	bailout:
 	return retval;
 }
 
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+	int subsamp)
+{
+	return tjBufSizeYUV2(width, 4, height, subsamp);
+}
 
 DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
 	int subsamp)
@@ -669,9 +694,9 @@
 }
 
 
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle, unsigned char *srcBuf,
 	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
-	int subsamp, int flags)
+	int pad, int subsamp, int flags)
 {
 	int i, retval=0;  JSAMPROW *row_pointer=NULL;
 	JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
@@ -694,12 +719,12 @@
 	}
 
 	if((this->init&COMPRESS)==0)
-		_throw("tjEncodeYUV2(): Instance has not been initialized for compression");
+		_throw("tjEncodeYUV3(): Instance has not been initialized for compression");
 
 	if(srcBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
-		|| pixelFormat>=TJ_NUMPF || dstBuf==NULL || subsamp<0
-		|| subsamp>=NUMSUBOPT)
-		_throw("tjEncodeYUV2(): Invalid argument");
+		|| pixelFormat>=TJ_NUMPF || dstBuf==NULL || pad<0 || !isPow2(pad)
+		|| subsamp<0 || subsamp>=NUMSUBOPT)
+		_throw("tjEncodeYUV3(): Invalid argument");
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -708,13 +733,16 @@
 		goto bailout;
 	}
 
+	if(pixelFormat==TJPF_CMYK)
+		_throw("tjEncodeYUV3(): Cannot generate YUV images from CMYK pixels");
+
 	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
 
 	#ifndef JCS_EXTENSIONS
 	if(pixelFormat!=TJPF_GRAY)
 	{
 		rgbBuf=(unsigned char *)malloc(width*height*RGB_PIXELSIZE);
-		if(!rgbBuf) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!rgbBuf) _throw("tjEncodeYUV3(): Memory allocation failure");
 		srcBuf=toRGB(srcBuf, width, pitch, height, pixelFormat, rgbBuf);
 		pitch=width*RGB_PIXELSIZE;
 	}
@@ -727,7 +755,7 @@
 	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
 	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	yuvsize=tjBufSizeYUV(width, height, subsamp);
+	yuvsize=tjBufSizeYUV2(width, pad, height, subsamp);
 	if(setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags)==-1) return -1;
 
 	/* Execute only the parts of jpeg_start_compress() that we need.  If we
@@ -746,7 +774,7 @@
 	ph=PAD(height, cinfo->max_v_samp_factor);
 
 	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph))==NULL)
-		_throw("tjEncodeYUV2(): Memory allocation failure");
+		_throw("tjEncodeYUV3(): Memory allocation failure");
 	for(i=0; i<height; i++)
 	{
 		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&srcBuf[(height-i-1)*pitch];
@@ -761,9 +789,9 @@
 		_tmpbuf[i]=(JSAMPLE *)malloc(
 			PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
 				/compptr->h_samp_factor, 16) * cinfo->max_v_samp_factor + 16);
-		if(!_tmpbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!_tmpbuf[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*cinfo->max_v_samp_factor);
-		if(!tmpbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!tmpbuf[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		for(row=0; row<cinfo->max_v_samp_factor; row++)
 		{
 			unsigned char *_tmpbuf_aligned=
@@ -774,9 +802,9 @@
 		}
 		_tmpbuf2[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
 			* compptr->v_samp_factor + 16);
-		if(!_tmpbuf2[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!_tmpbuf2[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		tmpbuf2[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
-		if(!tmpbuf2[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!tmpbuf2[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		for(row=0; row<compptr->v_samp_factor; row++)
 		{
 			unsigned char *_tmpbuf2_aligned=
@@ -787,15 +815,15 @@
 		cw[i]=pw*compptr->h_samp_factor/cinfo->max_h_samp_factor;
 		ch[i]=ph*compptr->v_samp_factor/cinfo->max_v_samp_factor;
 		outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]);
-		if(!outbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!outbuf[i]) _throw("tjEncodeYUV3(): Memory allocation failure");
 		for(row=0; row<ch[i]; row++)
 		{
 			outbuf[i][row]=ptr;
-			ptr+=PAD(cw[i], 4);
+			ptr+=PAD(cw[i], pad);
 		}
 	}
 	if(yuvsize!=(unsigned long)(ptr-dstBuf))
-		_throw("tjEncodeYUV2(): Generated image is not the correct size");
+		_throw("tjEncodeYUV3(): Generated image is not the correct size");
 
 	for(row=0; row<ph; row+=cinfo->max_v_samp_factor)
 	{
@@ -827,6 +855,14 @@
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
+	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
+	int subsamp, int flags)
+{
+	return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
+		dstBuf, 4, subsamp, flags);
+}
+
 DLLEXPORT int DLLCALL tjEncodeYUV(tjhandle handle, unsigned char *srcBuf,
 	int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
 	int subsamp, int flags)
@@ -836,6 +872,134 @@
 }
 
 
+DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle, unsigned char *srcBuf,
+	int width, int pad, int height, int subsamp, unsigned char **jpegBuf,
+	unsigned long *jpegSize, int jpegQual, int flags)
+{
+	int i, row, retval=0, alloc=1;  JSAMPROW *inbuf[MAX_COMPONENTS];
+	int cw[MAX_COMPONENTS], ch[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
+	JSAMPLE *_tmpbuf=NULL, *ptr=srcBuf;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+
+	getinstance(handle)
+
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		tmpbuf[i]=NULL;  inbuf[i]=NULL;
+	}
+
+	if((this->init&COMPRESS)==0)
+		_throw("tjCompressFromYUV(): Instance has not been initialized for compression");
+
+	if(srcBuf==NULL || width<=0 || pad<1 || height<=0 || subsamp<0
+		|| subsamp>=NUMSUBOPT || jpegBuf==NULL || jpegSize==NULL || jpegQual<0
+		|| jpegQual>100)
+		_throw("tjCompressFromYUV(): Invalid argument");
+
+	if(setjmp(this->jerr.setjmp_buffer))
+	{
+		/* If we get here, the JPEG code has signaled an error. */
+		retval=-1;
+		goto bailout;
+	}
+
+	cinfo->image_width=width;
+	cinfo->image_height=height;
+
+	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+
+	if(flags&TJFLAG_NOREALLOC)
+	{
+		alloc=0;  *jpegSize=tjBufSize(width, height, subsamp);
+	}
+	jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+	if(setCompDefaults(cinfo, TJPF_RGB, subsamp, jpegQual, flags)==-1)
+		return -1;
+	cinfo->raw_data_in=TRUE;
+
+	jpeg_start_compress(cinfo, TRUE);
+	for(i=0; i<cinfo->num_components; i++)
+	{
+		jpeg_component_info *compptr=&cinfo->comp_info[i];
+		int ih;
+		iw[i]=compptr->width_in_blocks*DCTSIZE;
+		ih=compptr->height_in_blocks*DCTSIZE;
+		cw[i]=PAD(cinfo->image_width, cinfo->max_h_samp_factor)
+			*compptr->h_samp_factor/cinfo->max_h_samp_factor;
+		ch[i]=PAD(cinfo->image_height, cinfo->max_v_samp_factor)
+			*compptr->v_samp_factor/cinfo->max_v_samp_factor;
+		if(iw[i]!=cw[i] || ih!=ch[i]) usetmpbuf=1;
+		th[i]=compptr->v_samp_factor*DCTSIZE;
+		tmpbufsize+=iw[i]*th[i];
+		if((inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]))==NULL)
+			_throw("tjCompressFromYUV(): Memory allocation failure");
+		for(row=0; row<ch[i]; row++)
+		{
+			inbuf[i][row]=ptr;
+			ptr+=PAD(cw[i], pad);
+		}
+	}
+	if(usetmpbuf)
+	{
+		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
+			_throw("tjCompressFromYUV(): Memory allocation failure");
+		ptr=_tmpbuf;
+		for(i=0; i<cinfo->num_components; i++)
+		{
+			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
+				_throw("tjCompressFromYUV(): Memory allocation failure");
+			for(row=0; row<th[i]; row++)
+			{
+				tmpbuf[i][row]=ptr;
+				ptr+=iw[i];
+			}
+		}
+	}
+
+	for(row=0; row<(int)cinfo->image_height;
+		row+=cinfo->max_v_samp_factor*DCTSIZE)
+	{
+		JSAMPARRAY yuvptr[MAX_COMPONENTS];
+		int crow[MAX_COMPONENTS];
+		for(i=0; i<cinfo->num_components; i++)
+		{
+			jpeg_component_info *compptr=&cinfo->comp_info[i];
+			crow[i]=row*compptr->v_samp_factor/cinfo->max_v_samp_factor;
+			if(usetmpbuf)
+			{
+				int j, k;
+				for(j=0; j<min(th[i], ch[i]-crow[i]); j++)
+				{
+					memcpy(tmpbuf[i][j], inbuf[i][crow[i]+j], cw[i]);
+					/* Duplicate last sample in row to fill out MCU */
+					for(k=cw[i]; k<iw[i]; k++) tmpbuf[i][j][k]=tmpbuf[i][j][cw[i]-1];
+				}
+				/* Duplicate last row to fill out MCU */
+				for(j=ch[i]-crow[i]; j<th[i]; j++)
+					memcpy(tmpbuf[i][j], tmpbuf[i][ch[i]-crow[i]-1], iw[i]);
+				yuvptr[i]=tmpbuf[i];
+			}
+			else
+				yuvptr[i]=&inbuf[i][crow[i]];
+		}
+		jpeg_write_raw_data(cinfo, yuvptr, cinfo->max_v_samp_factor*DCTSIZE);
+	}
+	jpeg_finish_compress(cinfo);
+
+	bailout:
+	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		if(tmpbuf[i]) free(tmpbuf[i]);
+		if(inbuf[i]) free(inbuf[i]);
+	}
+	if(_tmpbuf) free(_tmpbuf);
+	return retval;
+}
+
+
 /* Decompressor */
 
 static tjhandle _tjInitDecompress(tjinstance *this)
@@ -875,19 +1039,19 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
 	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-	int *jpegSubsamp)
+	int *jpegSubsamp, int *jpegColorspace)
 {
 	int retval=0;
 
 	getinstance(handle);
 	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressHeader2(): Instance has not been initialized for decompression");
+		_throw("tjDecompressHeader3(): Instance has not been initialized for decompression");
 
 	if(jpegBuf==NULL || jpegSize<=0 || width==NULL || height==NULL
-		|| jpegSubsamp==NULL)
-		_throw("tjDecompressHeader2(): Invalid argument");
+		|| jpegSubsamp==NULL || jpegColorspace==NULL)
+		_throw("tjDecompressHeader3(): Invalid argument");
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -901,18 +1065,38 @@
 	*width=dinfo->image_width;
 	*height=dinfo->image_height;
 	*jpegSubsamp=getSubsamp(dinfo);
+	switch(dinfo->jpeg_color_space)
+	{
+		case JCS_GRAYSCALE:  *jpegColorspace=TJCS_GRAY;  break;
+		case JCS_RGB:        *jpegColorspace=TJCS_RGB;  break;
+		case JCS_YCbCr:      *jpegColorspace=TJCS_YCbCr;  break;
+		case JCS_CMYK:       *jpegColorspace=TJCS_CMYK;  break;
+		case JCS_YCCK:       *jpegColorspace=TJCS_YCCK;  break;
+		default:             *jpegColorspace=-1;  break;
+	}
 
 	jpeg_abort_decompress(dinfo);
 
 	if(*jpegSubsamp<0)
-		_throw("tjDecompressHeader2(): Could not determine subsampling type for JPEG image");
+		_throw("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
+	if(*jpegColorspace<0)
+		_throw("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
 	if(*width<1 || *height<1)
-		_throw("tjDecompressHeader2(): Invalid data returned in header");
+		_throw("tjDecompressHeader3(): Invalid data returned in header");
 
 	bailout:
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
+	int *jpegSubsamp)
+{
+	int jpegColorspace;
+	return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
+		jpegSubsamp, &jpegColorspace);
+}
+
 DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
 	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height)
 {
@@ -1049,14 +1233,224 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-	int flags)
+static int setDecodeDefaults(struct jpeg_decompress_struct *dinfo,
+	int pixelFormat, int subsamp, int flags)
 {
-	int i, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
+	int i;
+
+	dinfo->scale_num=dinfo->scale_denom=1;
+
+	if(subsamp==TJSAMP_GRAY)
+	{
+		dinfo->num_components=dinfo->comps_in_scan=1;
+		dinfo->jpeg_color_space=JCS_GRAYSCALE;
+	}
+	else
+	{
+		dinfo->num_components=dinfo->comps_in_scan=3;
+		dinfo->jpeg_color_space=JCS_YCbCr;
+	}
+
+	dinfo->comp_info=(jpeg_component_info *)
+		(*dinfo->mem->alloc_small)((j_common_ptr)dinfo, JPOOL_IMAGE,
+			dinfo->num_components*SIZEOF(jpeg_component_info));
+
+	for(i=0; i<dinfo->num_components; i++)
+	{
+		jpeg_component_info *compptr=&dinfo->comp_info[i];
+		compptr->h_samp_factor=(i==0)? tjMCUWidth[subsamp]/8:1;
+		compptr->v_samp_factor=(i==0)? tjMCUHeight[subsamp]/8:1;
+		compptr->component_index=i;
+		compptr->component_id=i+1;
+		compptr->quant_tbl_no=compptr->dc_tbl_no=compptr->ac_tbl_no=
+			(i==0)? 0:1;
+		dinfo->cur_comp_info[i]=compptr;
+	}
+	dinfo->data_precision=8;
+	for(i=0; i<2; i++)
+	{
+		if(dinfo->quant_tbl_ptrs[i]==NULL)
+			dinfo->quant_tbl_ptrs[i]=jpeg_alloc_quant_table((j_common_ptr)dinfo);
+	}
+
+	return 0;
+}
+
+
+int my_read_markers(j_decompress_ptr dinfo)
+{
+	return JPEG_REACHED_SOS;
+}
+
+void my_reset_marker_reader(j_decompress_ptr dinfo)
+{
+}
+
+DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, unsigned char *srcBuf,
+	int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
+	int height, int pixelFormat, int flags)
+{
+	int i, retval=0;  JSAMPROW *row_pointer=NULL;
+	JSAMPLE *_tmpbuf[MAX_COMPONENTS];
+	JSAMPROW *tmpbuf[MAX_COMPONENTS], *inbuf[MAX_COMPONENTS];
+	int row, pw, ph, cw[MAX_COMPONENTS], ch[MAX_COMPONENTS];
+	JSAMPLE *ptr=srcBuf;
+	unsigned long yuvsize=0;
+	jpeg_component_info *compptr;
+	#ifndef JCS_EXTENSIONS
+	unsigned char *rgbBuf=NULL;
+	#endif
+	JMETHOD(int, old_read_markers, (j_decompress_ptr));
+	JMETHOD(void, old_reset_marker_reader, (j_decompress_ptr));
+
+	getinstance(handle);
+
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		tmpbuf[i]=NULL;  _tmpbuf[i]=NULL;  inbuf[i]=NULL;
+	}
+
+	if((this->init&DECOMPRESS)==0)
+		_throw("tjDecodeYUV(): Instance has not been initialized for compression");
+
+	if(srcBuf==NULL || pad<0 || !isPow2(pad) || subsamp<0 || subsamp>=NUMSUBOPT
+		|| dstBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
+		|| pixelFormat>=TJ_NUMPF)
+		_throw("tjDecodeYUV(): Invalid argument");
+
+	if(setjmp(this->jerr.setjmp_buffer))
+	{
+		/* If we get here, the JPEG code has signaled an error. */
+		retval=-1;
+		goto bailout;
+	}
+
+	if(pixelFormat==TJPF_CMYK)
+		_throw("tjDecodeYUV(): Cannot decode YUV images into CMYK pixels.");
+
+	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
+
+	#ifndef JCS_EXTENSIONS
+	if(pixelFormat!=TJPF_GRAY)
+	{
+		rgbBuf=(unsigned char *)malloc(width*height*RGB_PIXELSIZE);
+		if(!rgbBuf) _throw("tjDecodeYUV(): Memory allocation failure");
+		srcBuf=toRGB(srcBuf, width, pitch, height, pixelFormat, rgbBuf);
+		pitch=width*RGB_PIXELSIZE;
+	}
+	#endif
+
+	dinfo->image_width=width;
+	dinfo->image_height=height;
+
+	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+
+	yuvsize=tjBufSizeYUV2(width, pad, height, subsamp);
+	if(setDecodeDefaults(dinfo, pixelFormat, subsamp, flags)==-1)
+	{
+		retval=-1;  goto bailout;
+	}
+	old_read_markers=dinfo->marker->read_markers;
+	dinfo->marker->read_markers=my_read_markers;
+	old_reset_marker_reader=dinfo->marker->reset_marker_reader;
+	dinfo->marker->reset_marker_reader=my_reset_marker_reader;
+	jpeg_read_header(dinfo, TRUE);
+	dinfo->marker->read_markers=old_read_markers;
+	dinfo->marker->reset_marker_reader=old_reset_marker_reader;
+
+	if(setDecompDefaults(dinfo, pixelFormat, flags)==-1)
+	{
+		retval=-1;  goto bailout;
+	}
+	dinfo->do_fancy_upsampling=FALSE;
+	jinit_master_decompress(dinfo);
+	(*dinfo->upsample->start_pass)(dinfo);
+
+	pw=PAD(width, dinfo->max_h_samp_factor);
+	ph=PAD(height, dinfo->max_v_samp_factor);
+
+	if(pitch==0) pitch=dinfo->output_width*tjPixelSize[pixelFormat];
+
+	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph))==NULL)
+		_throw("tjDecodeYUV(): Memory allocation failure");
+	for(i=0; i<height; i++)
+	{
+		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&dstBuf[(height-i-1)*pitch];
+		else row_pointer[i]=&dstBuf[i*pitch];
+	}
+	if(height<ph)
+		for(i=height; i<ph; i++) row_pointer[i]=row_pointer[height-1];
+
+	for(i=0; i<dinfo->num_components; i++)
+	{
+		compptr=&dinfo->comp_info[i];
+		_tmpbuf[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
+			* compptr->v_samp_factor + 16);
+		if(!_tmpbuf[i]) _throw("tjDecodeYUV(): Memory allocation failure");
+		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
+		if(!tmpbuf[i]) _throw("tjDecodeYUV(): Memory allocation failure");
+		for(row=0; row<compptr->v_samp_factor; row++)
+		{
+			unsigned char *_tmpbuf_aligned=
+				(unsigned char *)PAD((size_t)_tmpbuf[i], 16);
+			tmpbuf[i][row]=&_tmpbuf_aligned[
+				PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
+		}
+		cw[i]=pw*compptr->h_samp_factor/dinfo->max_h_samp_factor;
+		ch[i]=ph*compptr->v_samp_factor/dinfo->max_v_samp_factor;
+		inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]);
+		if(!inbuf[i]) _throw("tjDecodeYUV(): Memory allocation failure");
+		for(row=0; row<ch[i]; row++)
+		{
+			inbuf[i][row]=ptr;
+			ptr+=PAD(cw[i], pad);
+		}
+	}
+
+	if(yuvsize!=(unsigned long)(ptr-srcBuf))
+		_throw("tjDecodeYUV(): YUV image is not the correct size");
+
+	for(row=0; row<ph; row+=dinfo->max_v_samp_factor)
+	{
+		JDIMENSION inrow=0, outrow=0;
+		for(i=0, compptr=dinfo->comp_info; i<dinfo->num_components; i++, compptr++)
+			jcopy_sample_rows(inbuf[i],
+				row*compptr->v_samp_factor/dinfo->max_v_samp_factor, tmpbuf[i], 0,
+				compptr->v_samp_factor, cw[i]);
+		(dinfo->upsample->upsample)(dinfo, tmpbuf, &inrow,
+			dinfo->max_v_samp_factor, &row_pointer[row], &outrow,
+			dinfo->max_v_samp_factor);
+	}
+	jpeg_abort_decompress(dinfo);
+
+	bailout:
+	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
+	#ifndef JCS_EXTENSIONS
+	if(rgbBuf) free(rgbBuf);
+	#endif
+	if(row_pointer) free(row_pointer);
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
+		if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
+		if(inbuf[i]!=NULL) free(inbuf[i]);
+	}
+	return retval;
+}
+
+
+DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
+	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	int width, int pad, int height, int flags)
+{
+	int i, sfi, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
+	int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
 	int cw[MAX_COMPONENTS], ch[MAX_COMPONENTS], iw[MAX_COMPONENTS],
 		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
 	JSAMPLE *_tmpbuf=NULL, *ptr=dstBuf;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+	int dctsize;
 
 	getinstance(handle);
 
@@ -1066,10 +1460,11 @@
 	}
 
 	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressToYUV(): Instance has not been initialized for decompression");
+		_throw("tjDecompressToYUV2(): Instance has not been initialized for decompression");
 
-	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL)
-		_throw("tjDecompressToYUV(): Invalid argument");
+	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL || width<0 || pad<1
+		|| !isPow2(pad) || height<0)
+		_throw("tjDecompressToYUV2(): Invalid argument");
 
 	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
 	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
@@ -1084,37 +1479,63 @@
 
 	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
 	jpeg_read_header(dinfo, TRUE);
+	jpegSubsamp=getSubsamp(dinfo);
+	if(jpegSubsamp<0)
+		_throw("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
+
+	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
+	if(width==0) width=jpegwidth;
+	if(height==0) height=jpegheight;
+	for(i=0; i<NUMSF; i++)
+	{
+		scaledw=TJSCALED(jpegwidth, sf[i]);
+		scaledh=TJSCALED(jpegheight, sf[i]);
+		if(scaledw<=width && scaledh<=height)
+			break;
+	}
+	if(scaledw>width || scaledh>height)
+		_throw("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
+	if(dinfo->num_components>3)
+		_throw("tjDecompressToYUV2(): JPEG image must have 3 or fewer components");
+
+	width=scaledw;  height=scaledh;
+	dinfo->scale_num=sf[i].num;
+	dinfo->scale_denom=sf[i].denom;
+	sfi=i;
+	jpeg_calc_output_dimensions(dinfo);
+
+	dctsize=DCTSIZE*sf[sfi].num/sf[sfi].denom;
 
 	for(i=0; i<dinfo->num_components; i++)
 	{
 		jpeg_component_info *compptr=&dinfo->comp_info[i];
 		int ih;
-		iw[i]=compptr->width_in_blocks*DCTSIZE;
-		ih=compptr->height_in_blocks*DCTSIZE;
-		cw[i]=PAD(dinfo->image_width, dinfo->max_h_samp_factor)
+		iw[i]=compptr->width_in_blocks*dctsize;
+		ih=compptr->height_in_blocks*dctsize;
+		cw[i]=PAD(dinfo->output_width, dinfo->max_h_samp_factor)
 			*compptr->h_samp_factor/dinfo->max_h_samp_factor;
-		ch[i]=PAD(dinfo->image_height, dinfo->max_v_samp_factor)
+		ch[i]=PAD(dinfo->output_height, dinfo->max_v_samp_factor)
 			*compptr->v_samp_factor/dinfo->max_v_samp_factor;
 		if(iw[i]!=cw[i] || ih!=ch[i]) usetmpbuf=1;
-		th[i]=compptr->v_samp_factor*DCTSIZE;
+		th[i]=compptr->v_samp_factor*dctsize;
 		tmpbufsize+=iw[i]*th[i];
 		if((outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]))==NULL)
-			_throw("tjDecompressToYUV(): Memory allocation failure");
+			_throw("tjDecompressToYUV2(): Memory allocation failure");
 		for(row=0; row<ch[i]; row++)
 		{
 			outbuf[i][row]=ptr;
-			ptr+=PAD(cw[i], 4);
+			ptr+=PAD(cw[i], pad);
 		}
 	}
 	if(usetmpbuf)
 	{
 		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
-			_throw("tjDecompressToYUV(): Memory allocation failure");
+			_throw("tjDecompressToYUV2(): Memory allocation failure");
 		ptr=_tmpbuf;
 		for(i=0; i<dinfo->num_components; i++)
 		{
 			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
-				_throw("tjDecompressToYUV(): Memory allocation failure");
+				_throw("tjDecompressToYUV2(): Memory allocation failure");
 			for(row=0; row<th[i]; row++)
 			{
 				tmpbuf[i][row]=ptr;
@@ -1129,18 +1550,37 @@
 
 	jpeg_start_decompress(dinfo);
 	for(row=0; row<(int)dinfo->output_height;
-		row+=dinfo->max_v_samp_factor*DCTSIZE)
+		row+=dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size)
 	{
 		JSAMPARRAY yuvptr[MAX_COMPONENTS];
 		int crow[MAX_COMPONENTS];
 		for(i=0; i<dinfo->num_components; i++)
 		{
 			jpeg_component_info *compptr=&dinfo->comp_info[i];
+			if(jpegSubsamp==TJ_420)
+			{
+				/* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
+				   to be clever and use the IDCT to perform upsampling on the U and V
+				   planes.  For instance, if the output image is to be scaled by 1/2
+				   relative to the JPEG image, then the scaling factor and upsampling
+				   effectively cancel each other, so a normal 8x8 IDCT can be used.
+				   However, this is not desirable when using the decompress-to-YUV
+				   functionality in TurboJPEG, since we want to output the U and V
+				   planes in their subsampled form.  Thus, we have to override some
+				   internal libjpeg parameters to force it to use the "scaled" IDCT
+				   functions on the U and V planes. */
+				compptr->_DCT_scaled_size=dctsize;
+				compptr->MCU_sample_width=tjMCUWidth[jpegSubsamp]*
+					sf[sfi].num/sf[sfi].denom*
+					compptr->v_samp_factor/dinfo->max_v_samp_factor;
+				dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
+			}
 			crow[i]=row*compptr->v_samp_factor/dinfo->max_v_samp_factor;
 			if(usetmpbuf) yuvptr[i]=tmpbuf[i];
 			else yuvptr[i]=&outbuf[i][crow[i]];
 		}
-		jpeg_read_raw_data(dinfo, yuvptr, dinfo->max_v_samp_factor*DCTSIZE);
+		jpeg_read_raw_data(dinfo, yuvptr,
+			dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size);
 		if(usetmpbuf)
 		{
 			int j;
@@ -1166,6 +1606,13 @@
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	int flags)
+{
+	return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
+}
+
 
 /* Transformer */
 
diff --git a/turbojpeg.h b/turbojpeg.h
index c778556..a082fa3 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2013 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -49,16 +49,16 @@
 /**
  * The number of chrominance subsampling options
  */
-#define TJ_NUMSAMP 5
+#define TJ_NUMSAMP 6
 
 /**
  * Chrominance subsampling options.
- * When an image is converted from the RGB to the YCbCr colorspace as part of
- * the JPEG compression process, some of the Cb and Cr (chrominance) components
- * can be discarded or averaged together to produce a smaller image with little
- * perceptible loss of image clarity (the human eye is more sensitive to small
- * changes in brightness than small changes in color.)  This is called
- * "chrominance subsampling".
+ * When pixels are converted from RGB to YCbCr (see #TJCS_YCbCr) or from CMYK
+ * to YCCK (see #TJCS_YCCK) as part of the JPEG compression process, some of
+ * the Cb and Cr (chrominance) components can be discarded or averaged together
+ * to produce a smaller image with little perceptible loss of image clarity
+ * (the human eye is more sensitive to small changes in brightness than to
+ * small changes in color.)  This is called "chrominance subsampling".
  * <p>
  * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
  * convention of the digital video community, the TurboJPEG API uses "YUV" to
@@ -91,7 +91,18 @@
    * chrominance component for every 1x2 block of pixels in the source image.
    * Note that 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
    */
-  TJSAMP_440
+  TJSAMP_440,
+  /**
+   * 4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 4x1 block of pixels in the source image.
+   * JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+   * horizontal features.  Note that 4:1:1 subsampling is not fully accelerated
+   * in libjpeg-turbo.
+   */
+  TJSAMP_411
 };
 
 /**
@@ -100,9 +111,10 @@
  * - 8x8 for no subsampling or grayscale
  * - 16x8 for 4:2:2
  * - 8x16 for 4:4:0
- * - 16x16 for 4:2:0 
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
  */
-static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8};
+static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8, 32};
 
 /**
  * MCU block height (in pixels) for a given level of chrominance subsampling.
@@ -110,15 +122,16 @@
  * - 8x8 for no subsampling or grayscale
  * - 16x8 for 4:2:2
  * - 8x16 for 4:4:0
- * - 16x16 for 4:2:0 
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
  */
-static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16};
+static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16, 8};
 
 
 /**
  * The number of pixel formats
  */
-#define TJ_NUMPF 11
+#define TJ_NUMPF 12
 
 /**
  * Pixel formats
@@ -193,16 +206,33 @@
    * decompressing, the X component is guaranteed to be 0xFF, which can be
    * interpreted as an opaque alpha channel.
    */
-  TJPF_ARGB
+  TJPF_ARGB,
+  /**
+   * CMYK pixel format.  Unlike RGB, which is an additive color model used
+   * primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+   * color model used primarily for printing.  In the CMYK color model, the
+   * value of each color component typically corresponds to an amount of cyan,
+   * magenta, yellow, or black ink that is applied to a white background.  In
+   * order to convert between CMYK and RGB, it is necessary to use a color
+   * management system (CMS.)  A CMS will attempt to map colors within the
+   * printer's gamut to perceptually similar colors in the display's gamut and
+   * vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+   * be defined with a simple formula.  Thus, such a conversion is out of scope
+   * for a codec library.  However, the TurboJPEG API allows for compressing
+   * CMYK pixels into a YCCK JPEG image (see #TJCS_YCCK) and decompressing YCCK
+   * JPEG images into CMYK pixels.
+   */
+  TJPF_CMYK
 };
 
+
 /**
  * Red offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the red component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
  * then the red component will be <tt>pixel[tjRedOffset[TJ_BGRX]]</tt>.
  */
-static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1};
+static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1, -1};
 /**
  * Green offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the green component is offset from the start of the pixel.
@@ -210,19 +240,81 @@
  * <tt>char pixel[]</tt>, then the green component will be
  * <tt>pixel[tjGreenOffset[TJ_BGRX]]</tt>.
  */
-static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2};
+static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, -1};
 /**
  * Blue offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the Blue component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
  * then the blue component will be <tt>pixel[tjBlueOffset[TJ_BGRX]]</tt>.
  */
-static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3};
+static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3, -1};
 
 /**
  * Pixel size (in bytes) for a given pixel format.
  */
-static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4};
+static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4};
+
+
+/**
+ * The number of JPEG colorspaces
+ */
+#define TJ_NUMCS 5
+
+/**
+ * JPEG colorspaces
+ */
+enum TJCS
+{
+  /**
+   * RGB colorspace.  When compressing the JPEG image, the R, G, and B
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  RGB JPEG images can be
+   * decompressed to any of the extended RGB pixel formats or grayscale, but
+   * they cannot be decompressed to YUV images.
+   */
+  TJCS_RGB=0,
+  /**
+   * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+   * mathematical transformation of RGB designed solely for storage and
+   * transmission.  YCbCr images must be converted to RGB before they can
+   * actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+   * component represents the black & white portion of the original image, and
+   * the Cb and Cr (chrominance) components represent the color portion of the
+   * original image.  Originally, the analog equivalent of this transformation
+   * allowed the same signal to drive both black & white and color televisions,
+   * but JPEG images use YCbCr primarily because it allows the color data to be
+   * optionally subsampled for the purposes of reducing bandwidth or disk
+   * space.  YCbCr is the most common JPEG colorspace, and YCbCr JPEG images
+   * can be compressed from and decompressed to any of the extended RGB pixel
+   * formats or grayscale, or they can be decompressed to YUV planar images.
+   */
+  TJCS_YCbCr,
+  /**
+   * Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+   * component), and any color data from the source image is discarded.
+   * Grayscale JPEG images can be compressed from and decompressed to any of
+   * the extended RGB pixel formats or grayscale, or they can be decompressed
+   * to YUV planar images.
+   */
+  TJCS_GRAY,
+  /**
+   * CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  CMYK JPEG images can
+   * only be decompressed to CMYK pixels.
+   */
+  TJCS_CMYK,
+  /**
+   * YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+   * rather a mathematical transformation of CMYK designed solely for storage
+   * and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+   * reversibly transformed into YCCK, and as with YCbCr, the chrominance
+   * components in the YCCK pixels can be subsampled without incurring major
+   * perceptual loss.  YCCK JPEG images can only be compressed from and
+   * decompressed to CMYK pixels.
+   */
+  TJCS_YCCK
+};
 
 
 /**
@@ -231,26 +323,6 @@
  */
 #define TJFLAG_BOTTOMUP        2
 /**
- * Turn off CPU auto-detection and force TurboJPEG to use MMX code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCEMMX        8
-/**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCESSE       16
-/**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE2 code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCESSE2      32
-/**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE3 code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCESSE3     128
-/**
  * When decompressing an image that was compressed using chrominance
  * subsampling, use the fastest chrominance upsampling algorithm available in
  * the underlying codec.  The default is to use smooth upsampling, which
@@ -445,8 +517,8 @@
   /**
    * A callback function that can be used to modify the DCT coefficients
    * after they are losslessly transformed but before they are transcoded to a
-   * new JPEG file.  This allows for custom filters or other transformations to
-   * be applied in the frequency domain.
+   * new JPEG image.  This allows for custom filters or other transformations
+   * to be applied in the frequency domain.
    *
    * @param coeffs pointer to an array of transformed DCT coefficients.  (NOTE:
    *        this pointer is not guaranteed to be valid once the callback
@@ -512,11 +584,11 @@
 
 
 /**
- * Compress an RGB or grayscale image into a JPEG image.
+ * Compress an RGB, grayscale, or CMYK image into a JPEG image.
  *
  * @param handle a handle to a TurboJPEG compressor or transformer instance
- * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
- *        to be compressed
+ * @param srcBuf pointer to an image buffer containing RGB, grayscale, or
+ *        CMYK pixels to be compressed
  * @param width width (in pixels) of the source image
  * @param pitch bytes per line of the source image.  Normally, this should be
  *        <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded,
@@ -564,6 +636,60 @@
 
 
 /**
+ * Compress a YUV planar image into a JPEG image.
+ *
+ * @param handle a handle to a TurboJPEG compressor or transformer instance
+ * @param srcBuf pointer to an image buffer containing a YUV planar image
+ *        to be compressed.  The Y, U (Cb), and V (Cr) image planes should be
+ *        stored sequentially in the buffer, and the size of each plane
+ *        is determined by the specified width, height, padding, and level of
+ *        chrominance subsampling.  If the chrominance components are
+ *        subsampled along the horizontal dimension, then the width of the
+ *        luminance plane should be padded to the nearest multiple of 2 (same
+ *        goes for the height of the luminance plane, if the chrominance
+ *        components are subsampled along the vertical dimension.)  This is
+ *        irrespective of any additional padding specified in the <tt>pad</tt>
+ *        parameter.
+ * @param width width (in pixels) of the source image
+ * @param pad the line padding used in the source image.  For instance, if each
+ *        line in each plane of the YUV image is padded to the nearest multiple
+ *        of 4 bytes, then <tt>pad</tt> should be set to 4.
+ * @param height height (in pixels) of the source image
+ * @param subsamp the level of chrominance subsampling used in the source
+ *        image (see @ref TJSAMP "Chrominance subsampling options".)
+ * @param jpegBuf address of a pointer to an image buffer that will receive the
+ *        JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer
+ *        to accommodate the size of the JPEG image.  Thus, you can choose to:
+ *        -# pre-allocate the JPEG buffer with an arbitrary size using
+ *        #tjAlloc() and let TurboJPEG grow the buffer as needed,
+ *        -# set <tt>*jpegBuf</tt> to NULL to tell TurboJPEG to allocate the
+ *        buffer for you, or
+ *        -# pre-allocate the buffer to a "worst case" size determined by
+ *        calling #tjBufSize().  This should ensure that the buffer never has
+ *        to be re-allocated (setting #TJFLAG_NOREALLOC guarantees this.)
+ *        .
+ *        If you choose option 1, <tt>*jpegSize</tt> should be set to the
+ *        size of your pre-allocated buffer.  In any case, unless you have
+ *        set #TJFLAG_NOREALLOC, you should always check <tt>*jpegBuf</tt> upon
+ *        return from this function, as it may have changed.
+ * @param jpegSize pointer to an unsigned long variable that holds the size of
+ *        the JPEG image buffer.  If <tt>*jpegBuf</tt> points to a
+ *        pre-allocated buffer, then <tt>*jpegSize</tt> should be set to the
+ *        size of the buffer.  Upon return, <tt>*jpegSize</tt> will contain the
+ *        size of the JPEG image (in bytes.)
+ * @param jpegQual the image quality of the generated JPEG image (1 = worst,
+          100 = best)
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ *        "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle, unsigned char *srcBuf,
+  int width, int pad, int height, int subsamp, unsigned char **jpegBuf,
+  unsigned long *jpegSize, int jpegQual, int flags);
+
+
+/**
  * The maximum size of the buffer (in bytes) required to hold a JPEG image with
  * the given parameters.  The number of bytes returned by this function is
  * larger than the size of the uncompressed source image.  The reason for this
@@ -592,6 +718,8 @@
  * the given parameters.
  *
  * @param width width of the image (in pixels)
+ * @param pad the width of each line in each plane of the image is padded to
+ *        the nearest multiple of this number of bytes (must be a power of 2.)
  * @param height height of the image (in pixels)
  * @param subsamp level of chrominance subsampling in the image (see
  *        @ref TJSAMP "Chrominance subsampling options".)
@@ -599,22 +727,22 @@
  * @return the size of the buffer (in bytes) required to hold the image, or
  * -1 if the arguments are out of bounds.
  */
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
   int subsamp);
 
 
 /**
  * Encode an RGB or grayscale image into a YUV planar image.  This function
- * uses the accelerated color conversion routines in TurboJPEG's underlying
- * codec to produce a planar YUV image that is suitable for X Video.
- * Specifically, if the chrominance components are subsampled along the
- * horizontal dimension, then the width of the luminance plane is padded to the
- * nearest multiple of 2 in the output image (same goes for the height of the
- * luminance plane, if the chrominance components are subsampled along the
- * vertical dimension.)  Also, each line of each plane in the output image is
- * padded to 4 bytes.  Although this will work with any subsampling option, it
- * is really only useful in combination with TJ_420, which produces an image
- * compatible with the I420 (AKA "YUV420P") format.
+ * uses the accelerated color conversion routines in the underlying
+ * codec but does not execute any of the other steps in the JPEG compression
+ * process.  The Y, U (Cb), and V (Cr) image planes are stored sequentially
+ * into the destination buffer, and the size of each plane is determined by the
+ * width and height of the source image, as well as the specified padding and
+ * level of chrominance subsampling.  If the chrominance components are
+ * subsampled along the horizontal dimension, then the width of the luminance
+ * plane is padded to the nearest multiple of 2 in the output image (same goes
+ * for the height of the luminance plane, if the chrominance components are
+ * subsampled along the vertical dimension.)
  * <p>
  * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
  * convention of the digital video community, the TurboJPEG API uses "YUV" to
@@ -635,20 +763,26 @@
  * @param pixelFormat pixel format of the source image (see @ref TJPF
  *        "Pixel formats".)
  * @param dstBuf pointer to an image buffer that will receive the YUV image.
- *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
- *        based on the image width, height, and level of chrominance
- *        subsampling.
+ *        Use #tjBufSizeYUV2() to determine the appropriate size for this
+ *        buffer based on the image width, height, padding, and level of
+ *        chrominance subsampling.
+ * @param pad the width of each line in each plane of the YUV image will be
+ *        padded to the nearest multiple of this number of bytes (must be a
+ *        power of 2.)  To generate images suitable for X Video, <tt>pad</tt>
+ *        should be set to 4.
  * @param subsamp the level of chrominance subsampling to be used when
  *        generating the YUV image (see @ref TJSAMP
- *        "Chrominance subsampling options".)
+ *        "Chrominance subsampling options".)  To generate images suitable for
+ *        X Video, <tt>subsamp</tt> should be set to @ref TJSAMP_420.  This
+ *        produces an image compatible with the I420 (AKA "YUV420P") format.
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
  *        "flags".
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
+DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
   unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
-  unsigned char *dstBuf, int subsamp, int flags);
+  unsigned char *dstBuf, int pad, int subsamp, int flags);
 
 
 /**
@@ -673,12 +807,15 @@
  * @param jpegSubsamp pointer to an integer variable that will receive the
  *        level of chrominance subsampling used when compressing the JPEG image
  *        (see @ref TJSAMP "Chrominance subsampling options".)
+ * @param jpegColorspace pointer to an integer variable that will receive one
+ *        of the JPEG colorspace constants, indicating the colorspace of the
+ *        JPEG image (see @ref TJCS "JPEG colorspaces".)
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-  int *jpegSubsamp);
+  int *jpegSubsamp, int *jpegColorspace);
 
 
 /**
@@ -695,7 +832,7 @@
 
 
 /**
- * Decompress a JPEG image to an RGB or grayscale image.
+ * Decompress a JPEG image to an RGB, grayscale, or CMYK image.
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
  * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
@@ -722,8 +859,8 @@
  *        calling #TJSCALED() with the JPEG image width and one of the scaling
  *        factors returned by #tjGetScalingFactors().)  You can also be clever
  *        and use the pitch parameter to skip lines, etc.  Setting this
- *        parameter to 0 is the equivalent of setting it to <tt>scaledWidth
- *        * #tjPixelSize[pixelFormat]</tt>.
+ *        parameter to 0 is the equivalent of setting it to
+ *        <tt>scaledWidth * #tjPixelSize[pixelFormat]</tt>.
  * @param height desired height (in pixels) of the destination image.  If this
  *        is different than the height of the JPEG image being decompressed,
  *        then TurboJPEG will use scaling in the JPEG decompressor to generate
@@ -745,11 +882,11 @@
 /**
  * Decompress a JPEG image to a YUV planar image.  This function performs JPEG
  * decompression but leaves out the color conversion step, so a planar YUV
- * image is generated instead of an RGB image.  The padding of the planes in
- * this image is the same as in the images generated by #tjEncodeYUV2().  Note
- * that, if the width or height of the image is not an even multiple of the MCU
- * block size (see #tjMCUWidth and #tjMCUHeight), then an intermediate buffer
- * copy will be performed within TurboJPEG.
+ * image is generated instead of an RGB image.  The structure of the planes in
+ * this image is the same as in the images generated by #tjEncodeYUV3().  Note
+ * that, if the width or height of the JPEG image is not an even multiple of
+ * the MCU block size (see #tjMCUWidth and #tjMCUHeight), then an intermediate
+ * buffer copy will be performed within TurboJPEG.
  * <p>
  * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
  * convention of the digital video community, the TurboJPEG API uses "YUV" to
@@ -759,16 +896,87 @@
  * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
  * @param jpegSize size of the JPEG image (in bytes)
  * @param dstBuf pointer to an image buffer that will receive the YUV image.
- *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
- *        based on the image width, height, and level of subsampling.
+ *        Use #tjBufSizeYUV2() to determine the appropriate size for this
+ *        buffer based on the image width, height, padding, and level of
+ *        subsampling.
+ * @param width desired width (in pixels) of the YUV image.  If this is
+ *        different than the width of the JPEG image being decompressed, then
+ *        TurboJPEG will use scaling in the JPEG decompressor to generate the
+ *        largest possible image that will fit within the desired width.  If
+ *        <tt>width</tt> is set to 0, then only the height will be considered
+ *        when determining the scaled image size.
+ * @param pad the width of each line in each plane of the YUV image will be
+ *        padded to the nearest multiple of this number of bytes (must be a
+ *        power of 2.)  To generate images suitable for X Video, <tt>pad</tt>
+ *        should be set to 4.
+ * @param height desired height (in pixels) of the YUV image.  If this is
+ *        different than the height of the JPEG image being decompressed, then
+ *        TurboJPEG will use scaling in the JPEG decompressor to generate the
+ *        largest possible image that will fit within the desired height.  If
+ *        <tt>height</tt> is set to 0, then only the width will be considered
+ *        when determining the scaled image size.
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
  *        "flags".
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-  int flags);
+  int width, int pad, int height, int flags);
+
+
+/**
+ * Decode a YUV planar image into an RGB or grayscale image.  This function
+ * uses the accelerated color conversion routines in the underlying
+ * codec but does not execute any of the other steps in the JPEG decompression
+ * process.  The Y, U (Cb), and V (Cr) image planes should be stored
+ * sequentially in the source buffer, and the size of each plane is determined
+ * by the width and height of the source image, as well as the specified
+ * padding and level of chrominance subsampling.  If the chrominance components
+ * are subsampled along the horizontal dimension, then the width of the
+ * luminance plane should be padded to the nearest multiple of 2 in the input
+ * image (same goes for the height of the luminance plane, if the chrominance
+ * components are subsampled along the vertical dimension.)
+ * <p>
+ * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
+ * convention of the digital video community, the TurboJPEG API uses "YUV" to
+ * refer to an image format consisting of Y, Cb, and Cr image planes.
+ *
+ * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ * @param srcBuf pointer to an image buffer containing a YUV planar image to be
+ *        decoded.  The size of this buffer should match the value returned
+ *        by #tjBufSizeYUV2() for the given image width, height, padding, and
+ *        level of chrominance subsampling.
+ * @param pad Use this parameter to specify that the width of each line in each
+ *        plane of the YUV source image is padded to the nearest multiple of
+ *        this number of bytes (must be a power of 2.)
+ * @param subsamp the level of chrominance subsampling used in the YUV source
+ *        image (see @ref TJSAMP "Chrominance subsampling options".)
+ * @param dstBuf pointer to an image buffer that will receive the decoded
+ *        image.  This buffer should normally be <tt>pitch * height</tt>
+ *        bytes in size, but the <tt>dstBuf</tt> pointer can also be used to
+ *        decode into a specific region of a larger buffer.
+ * @param width width (in pixels) of the source and destination images
+ * @param pitch bytes per line of the destination image.  Normally, this should
+ *        be <tt>width * #tjPixelSize[pixelFormat]</tt> if the destination
+ *        image is unpadded, or <tt>#TJPAD(width *
+ *        #tjPixelSize[pixelFormat])</tt> if each line of the destination
+ *        image should be padded to the nearest 32-bit boundary, as is the case
+ *        for Windows bitmaps.  You can also be clever and use the pitch
+ *        parameter to skip lines, etc.  Setting this parameter to 0 is the
+ *        equivalent of setting it to <tt>width *
+ *        #tjPixelSize[pixelFormat]</tt>.
+ * @param height height (in pixels) of the source and destination images
+ * @param pixelFormat pixel format of the destination image (see @ref TJPF
+ *        "Pixel formats".)
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ *        "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, unsigned char *srcBuf,
+	int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
+	int height, int pixelFormat, int flags);
 
 
 /**
@@ -882,6 +1090,13 @@
 DLLEXPORT char* DLLCALL tjGetErrorStr(void);
 
 
+/* Deprecated functions and macros */
+#define TJFLAG_FORCEMMX        8
+#define TJFLAG_FORCESSE       16
+#define TJFLAG_FORCESSE2      32
+#define TJFLAG_FORCESSE3     128
+
+
 /* Backward compatibility functions and macros (nothing to see here) */
 #define NUMSUBOPT TJ_NUMSAMP
 #define TJ_444 TJSAMP_444
@@ -905,6 +1120,9 @@
 DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
   int jpegSubsamp);
 
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+  int subsamp);
+
 DLLEXPORT int DLLCALL tjCompress(tjhandle handle, unsigned char *srcBuf,
   int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
   unsigned long *compressedSize, int jpegSubsamp, int jpegQual, int flags);
@@ -913,13 +1131,25 @@
   unsigned char *srcBuf, int width, int pitch, int height, int pixelSize,
   unsigned char *dstBuf, int subsamp, int flags);
 
+DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
+  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
+  unsigned char *dstBuf, int subsamp, int flags);
+
 DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height);
 
+DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
+  int *jpegSubsamp);
+
 DLLEXPORT int DLLCALL tjDecompress(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
   int width, int pitch, int height, int pixelSize, int flags);
 
+DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  int flags);
+
 
 /**
  * @}