Oops.  dumpbuf() was displaying only red components.


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@428 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/BUILDING.txt b/BUILDING.txt
index 50e2fb7..0af9fdf 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -30,6 +30,13 @@
 
 -- GCC v4.1 or later recommended for best performance
 
+-- If building the TurboJPEG/OSS JNI wrapper, jni.h is required.  Some systems,
+   such as OS X 10.4 and Solaris 10, have this header pre-installed.  On OS X
+   10.5 and later, the header can be obtained by installing the Java Developer
+   Package, which can be downloaded from http://connect.apple.com.  On Linux
+   and other systems, the header can be obtained by installing the GCJ
+   (GCC-Java) development packages or the Oracle Java Development Kit (JDK).
+
 
 ==================
 Out-of-Tree Builds
@@ -117,6 +124,14 @@
 disable encoding or decoding (respectively.)
 
 
+TurboJPEG/OSS JNI Wrapper
+-------------------------
+Add --with-jni to the configure command line to incorporate an optional Java
+Native Interface wrapper into the TurboJPEG/OSS dynamic library.  This allows
+the dynamic library to be used directly from Java applications.  See
+java/README for more details.
+
+
 ========================
 Installing libjpeg-turbo
 ========================
@@ -295,6 +310,15 @@
 -- NASM (http://www.nasm.us/) 0.98 or later (NASM 2.05 or later is required for
    a 64-bit build)
 
+-- If building the TurboJPEG/OSS JNI wrapper, jni.h is required.  This header
+   can be obtained by installing the Oracle Java Development Kit (JDK).
+   * If using Visual C++, then add the appropriate Java include directories
+     (Example:  c:\Program Files\Java\jdk1.6.0_23\include;c:\Program Files\Java\jdk1.6.0_23\include\win32)
+     to the INCLUDE environment variable prior to building libjpeg-turbo.
+   * If using MinGW, then add the appropriate Java include directories
+     (Example:  /c/Program Files/Java/jdk1.6.0_23/include:/c/Program Files/Java/jdk1.6.0_23/include/win32)
+     to the CPATH environment variable prior to building libjpeg-turbo.
+
 
 ==================
 Out-of-Tree Builds
@@ -434,6 +458,14 @@
 disable encoding or decoding (respectively.)
 
 
+TurboJPEG/OSS JNI Wrapper
+-------------------------
+Add "-DWITH_JNI=1" to the cmake command line to incorporate an optional Java
+Native Interface wrapper into the TurboJPEG/OSS dynamic library.  This allows
+the dynamic library to be used directly from Java applications.  See
+java/README for more details.
+
+
 ========================
 Installing libjpeg-turbo
 ========================
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c1d1b9..c7dc27f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 cmake_minimum_required(VERSION 2.6)
 
 project(libjpeg-turbo C)
-set(VERSION 1.1.0)
+set(VERSION 1.1.90)
 
 if(MINGW OR CYGWIN)
   execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
@@ -58,6 +58,16 @@
   message(STATUS "Arithmetic decoding support disabled")
 endif()
 
+if(NOT DEFINED WITH_JNI)
+  set(WITH_JNI 0)
+endif()
+
+if(WITH_JNI)
+  message(STATUS "TurboJPEG/OSS JNI wrapper enabled")
+else()
+  message(STATUS "TurboJPEG/OSS JNI wrapper disabled")
+endif()
+
 set(JPEG_LIB_VERSION 62)
 set(DLL_VERSION ${JPEG_LIB_VERSION})
 set(FULLVERSION ${DLL_VERSION}.0.0)
@@ -105,6 +115,14 @@
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR})
 
+if(WITH_JNI)
+  include(CheckIncludeFiles)
+  check_include_files(jni.h HAVE_JNI_H)
+  if(NOT HAVE_JNI_H)
+    message(FATAL_ERROR "Cannot find jni.h.  Be sure to add the Java include directories to the INCLUDE environment variable (MSVC) or the CPATH environment variable (GCC).")
+  endif()
+endif()
+
 
 #
 # Targets
@@ -155,8 +173,16 @@
   add_dependencies(jpeg-static simd)
 endif()
 
-add_library(turbojpeg SHARED turbojpegl.c)
+set(TURBOJPEG_SOURCES turbojpegl.c)
+if(WITH_JNI)
+  set(TURBOJPEG_SOURCES ${TURBOJPEG_SOURCES} turbojpeg-jni.c)
+endif()
+
+add_library(turbojpeg SHARED ${TURBOJPEG_SOURCES})
 set_target_properties(turbojpeg PROPERTIES DEFINE_SYMBOL DLLDEFINE)
+if(MINGW)
+  set_target_properties(turbojpeg PROPERTIES LINK_FLAGS -Wl,--kill-at)
+endif()
 target_link_libraries(turbojpeg jpeg-static)
 set_target_properties(turbojpeg PROPERTIES LINK_INTERFACE_LIBRARIES "")
 
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 12d1c9e..23e73d8 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,3 +1,11 @@
+1.1.90 (1.2 beta1)
+==================
+
+[1] Added a JNI wrapper for TurboJPEG/OSS.  See java/README for more details.
+
+[2] TurboJPEG/OSS can now scale down images during decompression.
+
+
 1.1.0
 =====
 
diff --git a/Makefile.am b/Makefile.am
index 1a8e532..312ec41 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -34,12 +34,25 @@
 
 endif
 
-libturbojpeg_la_SOURCES = $(libjpeg_la_SOURCES) turbojpegl.c turbojpeg.h \
-	turbojpeg-mapfile
+libturbojpeg_la_SOURCES = $(libjpeg_la_SOURCES) turbojpegl.c turbojpeg.h
+
+if WITH_JNI
+
+libturbojpeg_la_SOURCES += turbojpeg-jni.c
+libturbojpeg_la_CFLAGS = ${JAVA_CFLAGS}
+TJMAPFILE = turbojpeg-mapfile.jni
+
+else
+
+TJMAPFILE = turbojpeg-mapfile
+
+endif
+
+libturbojpeg_la_SOURCES += $(TJMAPFILE)
 
 if ANON_VERSION_SCRIPT
 
-libturbojpeg_la_LDFLAGS += $(ANON_VERSION_SCRIPT_FLAG)$(srcdir)/turbojpeg-mapfile
+libturbojpeg_la_LDFLAGS += $(ANON_VERSION_SCRIPT_FLAG)$(srcdir)/$(TJMAPFILE)
 
 endif
 
@@ -113,9 +126,9 @@
 TESTFILES= testorig.jpg testorig.ppm testimg.bmp testimgflt.jpg \
 	testimgfst.jpg testimgint.jpg testimgp.jpg testimgflt.ppm testimgfst.ppm \
 	testimgint.ppm testimgflt-nosimd.jpg testimgcrop.jpg testimgari.jpg \
-	testimgari.ppm testimgfst100.jpg
+	testimgari.ppm testimgfst100.jpg testimggray.jpg
 
-EXTRA_DIST = win release $(DOCS) $(TESTFILES) CMakeLists.txt \
+EXTRA_DIST = win release java $(DOCS) $(TESTFILES) CMakeLists.txt \
 	sharedlib/CMakeLists.txt cmakescripts libjpeg.map.in
 
 dist-hook:
@@ -137,6 +150,8 @@
 else
 	cmp $(srcdir)/testimgflt-nosimd.jpg testoutflt.jpg
 endif
+	./cjpeg -dct int -grayscale -outfile testoutgray.jpg $(srcdir)/testorig.ppm
+	cmp $(srcdir)/testimggray.jpg testoutgray.jpg
 	./djpeg -dct int -fast -ppm -outfile testoutint.ppm $(srcdir)/testorig.jpg
 	cmp $(srcdir)/testimgint.ppm testoutint.ppm
 	./djpeg -dct fast -ppm -outfile testoutfst.ppm $(srcdir)/testorig.jpg
diff --git a/configure.ac b/configure.ac
index c520b61..84fca81 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([libjpeg-turbo], [1.1.0])
+AC_INIT([libjpeg-turbo], [1.1.90])
 BUILD=`date +%Y%m%d`
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
@@ -223,6 +223,48 @@
 
 AM_CONDITIONAL([WITH_ARITH], [test "x$with_arith_dec" != "xno" -o "x$with_arith_enc" != "xno"])
 
+AC_ARG_VAR(JAVA_CFLAGS, [Compiler flags needed to find jni.h (default: -I/System/Library/Frameworks/JavaVM.framework/Headers on OS X, '-I/usr/java/include -I/usr/java/include/solaris' on Solaris, and '-I/usr/java/default/include -I/usr/java/default/include/linux' on Linux)])
+
+AC_MSG_CHECKING([whether to include JNI wrapper in TurboJPEG/OSS])
+AC_ARG_WITH([jni],
+    AC_HELP_STRING([--with-jni],[Include JNI wrapper in the TurboJPEG/OSS library]))
+
+BUILDJNILIB=0
+RPM_CONFIG_ARGS=
+if test "x$with_jni" = "xyes"; then
+    AC_MSG_RESULT(yes)
+
+    case $host_os in
+      darwin*)
+        DEFAULT_JAVA_CFLAGS=-I/System/Library/Frameworks/JavaVM.framework/Headers
+        BUILDJNILIB=1
+        ;;
+      solaris*)
+        DEFAULT_JAVA_CFLAGS='-I/usr/java/include -I/usr/java/include/solaris'
+        ;;
+      linux*)
+        DEFAULT_JAVA_CFLAGS='-I/usr/java/default/include -I/usr/java/default/include/linux'
+        ;;
+    esac
+    if test "x$JAVA_CFLAGS" = "x"; then
+        JAVA_CFLAGS=$DEFAULT_JAVA_CFLAGS
+    fi
+
+    SAVE_CPPFLAGS=${CPPFLAGS}
+    CPPFLAGS="${CPPFLAGS} ${JAVA_CFLAGS}"
+    AC_CHECK_HEADERS([jni.h], [DUMMY=1],
+        [AC_MSG_ERROR([Could not find JNI header file])])
+    CPPFLAGS=${SAVE_CPPFLAGS}
+    AC_SUBST(JAVA_CFLAGS)
+
+    RPM_CONFIG_ARGS=--with-jni
+else
+    AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL([WITH_JNI], [test "x$with_jni" = "xyes"])
+AC_SUBST(BUILDJNILIB)
+AC_SUBST(RPM_CONFIG_ARGS)
+
 # SIMD is optional
 AC_ARG_WITH([simd],
     AC_HELP_STRING([--without-simd],[Omit accelerated SIMD routines.]))
diff --git a/java/README b/java/README
new file mode 100644
index 0000000..87d3181
--- /dev/null
+++ b/java/README
@@ -0,0 +1,52 @@
+TurboJPEG/OSS JNI Wrapper
+=========================
+
+TurboJPEG/OSS can optionally be built with a Java Native Interface wrapper,
+which allows the TurboJPEG/OSS dynamic library to be loaded and used directly
+from Java applications.  The Java front end for this is defined in several
+classes located under org/libjpegturbo/turbojpeg.  The source code for these
+Java classes is licensed under a BSD-style license, so the files can be
+incorporated directly into both open source and proprietary projects without
+restriction.
+
+TJExample.java, which should also be located in the same directory as this
+README file, demonstrates how to use the TurboJPEG/OSS Java front end to
+compress and decompress JPEG images in memory.
+
+  javac TJExample.java
+
+builds .class files for both the front end and example code.
+
+
+Note for OS X users
+-------------------
+
+/usr/lib, the directory under which libturbojpeg.dylib is installed on Mac
+systems, is not part of the normal Java library path.  Thus, when running a
+Java application that uses TurboJPEG/OSS on Mac systems, you will need to pass
+an argument of -Djava.library.path=/usr/lib to java.
+
+
+Note for Solaris users
+----------------------
+
+/opt/libjpeg-turbo/lib, the directory under which libturbojpeg.so is installed
+on Solaris systems, is not part of the normal Java library path.  Thus, when
+running a Java application that uses TurboJPEG/OSS on Solaris systems, you will
+need to pass an argument of -Djava.library.path=/opt/libjpeg-turbo/lib to java.
+If using a 64-bit data model, then instead pass an argument of
+-Djava.library.path=/opt/libjpeg-turbo/lib/amd64 to use the 64-bit version of
+libturbojpeg.so.
+
+
+Note for MinGW users
+--------------------
+
+When libjpeg-turbo is built with MinGW, the TurboJPEG/OSS dynamic library is
+named libturbojpeg.dll instead of turbojpeg.dll.  This is in keeping with the
+convention of MinGW, and it also avoids a filename conflict when the GCC and
+Visual C++ versions of the libjpeg-turbo SDK are installed on the same system.
+However, the TurboJPEG/OSS JNI wrapper will not work on Windows unless the DLL
+is named turbojpeg.dll.  You can work around this by renaming the DLL or by
+changing the System.loadLibrary() calls in the Java front-end classes so that
+they load "libturbojpeg" instead of "turbojpeg".
diff --git a/java/TJExample.java b/java/TJExample.java
new file mode 100644
index 0000000..3362e81
--- /dev/null
+++ b/java/TJExample.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This program demonstrates how to compress and decompress JPEG files using
+ * the TurboJPEG JNI wrapper
+ */
+
+import java.io.*;
+import org.libjpegturbo.turbojpeg.*;
+
+public class TJExample {
+
+  public static final String classname=new TJExample().getClass().getName();
+
+  private static void usage() {
+    System.out.println("\nUSAGE: java "+classname+" <Input file> <Output file> [options]\n");
+    System.out.println("Options:\n");
+    System.out.println("-scale 1/N = scale the width/height of the output image by a factor of 1/N");
+    System.out.println("             (N = 1, 2, 4, or 8)\n");
+    System.exit(1);  // a usage error always terminates the example
+  }
+
+  public static void main(String argv[]) {
+
+    try {
+
+      if(argv.length<2) {
+        usage();
+      }
+
+      int scalefactor=1;
+      if(argv.length>2) {
+        for(int i=2; i<argv.length; i++) {
+          if(argv[i].equalsIgnoreCase("-scale") && i<argv.length-1) {
+            String [] scalearg=argv[++i].split("/");      
+            if(scalearg.length!=2 || Integer.parseInt(scalearg[0])!=1
+              || (scalefactor=Integer.parseInt(scalearg[1]))<1
+              || scalefactor>8 || (scalefactor&(scalefactor-1))!=0)
+              usage();
+          }
+        }
+      }
+
+      File file=new File(argv[0]);
+      FileInputStream fis=new FileInputStream(file);
+      int inputsize=fis.available();
+      if(inputsize<1) {
+        System.out.println("Input file contains no data");
+        System.exit(1);
+      }
+      byte [] inputbuf=new byte[inputsize];
+      fis.read(inputbuf);
+      fis.close();
+
+      TJDecompressor tjd=new TJDecompressor(inputbuf);
+      int width=tjd.getWidth();
+      int height=tjd.getHeight();
+      int subsamp=tjd.getSubsamp();
+      System.out.print("Source Image: "+width+" x "+height+" pixels, ");
+      switch(subsamp) {
+        case TJ.SAMP444:  System.out.println("4:4:4 subsampling");  break;
+        case TJ.SAMP422:  System.out.println("4:2:2 subsampling");  break;
+        case TJ.SAMP420:  System.out.println("4:2:0 subsampling");  break;
+        case TJ.GRAYSCALE:  System.out.println("Grayscale");  break;
+        default:  System.out.println("Unknown subsampling");  break;
+      }
+
+      if(scalefactor!=1) {
+        width=(width+scalefactor-1)/scalefactor;
+        height=(height+scalefactor-1)/scalefactor;
+        System.out.println("Dest. Image:  "+width+" x "+height
+          +" pixels");
+      }
+
+      byte [] tmpbuf=tjd.decompress(width, 0, height, TJ.BGR, TJ.BOTTOMUP);
+      tjd.close();
+
+      TJCompressor tjc=new TJCompressor(tmpbuf, width, 0, height, TJ.BGR);
+      byte [] outputbuf=new byte[(int)TJ.bufSize(width, height)];
+      long outputsize=tjc.compress(outputbuf, subsamp, 95, TJ.BOTTOMUP);
+      tjc.close();
+
+      file=new File(argv[1]);
+      FileOutputStream fos=new FileOutputStream(file);
+      fos.write(outputbuf, 0, (int)outputsize);
+      fos.close();
+
+    } catch(Exception e) {
+      System.out.println(e);
+    }
+  }
+
+};
diff --git a/java/org/libjpegturbo/turbojpeg/TJ.java b/java/org/libjpegturbo/turbojpeg/TJ.java
new file mode 100644
index 0000000..6bbbd72
--- /dev/null
+++ b/java/org/libjpegturbo/turbojpeg/TJ.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.libjpegturbo.turbojpeg;
+
+final public class TJ {
+
+  // Chrominance subsampling options (NUMSUBOPT is the number of options)
+  final public static int
+    NUMSUBOPT  = 4,
+    SAMP444    = 0,
+    SAMP422    = 1,
+    SAMP420    = 2,
+    GRAYSCALE  = 3;
+
+  // Pixel formats (NUMPIXFORMATS is the number of formats)
+  final public static int
+    NUMPIXFORMATS = 7,
+    RGB           = 0,
+    BGR           = 1,
+    RGBX          = 2,
+    BGRX          = 3,
+    XBGR          = 4,
+    XRGB          = 5,
+    YUV           = 6;
+
+  final public static int pixelSize[] = {  // bytes per pixel, by pixel format
+    3, 3, 4, 4, 4, 4, 3
+  };
+
+  public static int getPixelSize(int pixelFormat) throws Exception {
+    if(pixelFormat < 0 || pixelFormat >= NUMPIXFORMATS)
+      throw new Exception("Invalid pixel format");
+    return pixelSize[pixelFormat];
+  }
+
+  // Flags accepted by the "flags" argument of compress()/decompress()
+  final public static int
+    BOTTOMUP     = 2,
+    FORCEMMX     = 8,
+    FORCESSE     = 16,
+    FORCESSE2    = 32,
+    FORCESSE3    = 128,
+    FASTUPSAMPLE = 256;
+
+  final private static int  // flag bits implied by a pixel format (see flags[])
+    TJ_BGR        = 1,
+    TJ_ALPHAFIRST = 64,
+    TJ_YUV        = 512;
+
+  final private static int flags[] = {
+    0, TJ_BGR, 0, TJ_BGR, TJ_BGR|TJ_ALPHAFIRST, TJ_ALPHAFIRST, TJ_YUV
+  };
+
+  public static int getFlags(int pixelFormat) throws Exception {
+    if(pixelFormat < 0 || pixelFormat >= NUMPIXFORMATS)
+      throw new Exception("Invalid pixel format");
+    return flags[pixelFormat];
+  }
+
+  public native final static long bufSize(int width, int height)
+    throws Exception;
+
+  public native final static long bufSizeYUV(int width, int height,
+    int subsamp) throws Exception;
+  static { System.loadLibrary("turbojpeg"); }  // back the native methods above
+};
diff --git a/java/org/libjpegturbo/turbojpeg/TJCompressor.java b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
new file mode 100644
index 0000000..38de21f
--- /dev/null
+++ b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.libjpegturbo.turbojpeg;
+
+public class TJCompressor {
+
+  public TJCompressor() throws Exception {
+    init();
+  }
+
+  public TJCompressor(byte [] buf, int width, int pitch, int height,
+    int pixelFormat) throws Exception {
+    setBitmapBuffer(buf, width, pitch, height, pixelFormat);
+  }
+
+  public void setBitmapBuffer(byte [] buf, int width, int pitch, int height,
+    int pixelFormat) throws Exception {
+    if(handle == 0) init();
+    if(buf == null || width < 1 || height < 1 || pitch < 0 || pixelFormat < 0
+      || pixelFormat >= TJ.NUMPIXFORMATS)
+      throw new Exception("Invalid argument in setBitmapBuffer()");
+    bitmapBuf = buf;
+    bitmapWidth = width;
+    if(pitch == 0) bitmapPitch = width * TJ.getPixelSize(pixelFormat);
+    else bitmapPitch = pitch;
+    bitmapHeight = height;
+    bitmapPixelFormat = pixelFormat;
+  }
+
+  public long compress(byte [] dstBuf, int jpegSubsamp, int jpegQual,
+    int flags) throws Exception {
+    return compress(bitmapBuf, bitmapWidth, bitmapPitch, bitmapHeight,
+      TJ.getPixelSize(bitmapPixelFormat), dstBuf, jpegSubsamp, jpegQual,
+        flags | TJ.getFlags(bitmapPixelFormat));
+  }
+
+  public void close() throws Exception {
+    if(handle != 0) { destroy();  handle = 0; }  // safe to call repeatedly
+  }
+
+  protected void finalize() throws Throwable {
+    try {
+      close();
+    } catch(Exception e) {
+    }
+    finally {
+      super.finalize();
+    }
+  };
+
+  private native void init() throws Exception;
+
+  private native void destroy() throws Exception;
+
+  // JPEG size in bytes is returned
+  private native long compress(byte [] srcBuf, int width, int pitch,
+    int height, int pixelSize, byte [] dstbuf, int jpegSubsamp, int jpegQual,
+    int flags) throws Exception;
+
+  static {
+    System.loadLibrary("turbojpeg");
+  }
+
+  private long handle = 0;
+  private byte [] bitmapBuf = null;
+  private int bitmapWidth = 0;
+  private int bitmapHeight = 0;
+  private int bitmapPitch = 0;
+  private int bitmapPixelFormat = -1;
+};
diff --git a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
new file mode 100644
index 0000000..446eb3c
--- /dev/null
+++ b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.libjpegturbo.turbojpeg;
+
+public class TJDecompressor {
+
+  public TJDecompressor() throws Exception {
+    init();
+  }
+
+  public TJDecompressor(byte [] buf) throws Exception {
+    setJPEGBuffer(buf);
+  }
+
+  public void setJPEGBuffer(byte [] buf) throws Exception {
+    if(handle == 0) init();
+    if(buf == null) throw new Exception("Invalid argument in setJPEGBuffer()");
+    jpegBuf = buf;
+    decompressHeader();
+  }
+
+  public int getWidth() throws Exception {
+    if(header != null && header.width >= 1) return header.width;
+    throw new Exception("JPEG buffer not initialized");  // no header parsed
+  }
+
+  public int getHeight() throws Exception {
+    if(header != null && header.height >= 1) return header.height;
+    throw new Exception("JPEG buffer not initialized");  // no header parsed
+  }
+
+  public int getSubsamp() throws Exception {
+    if(header != null && header.subsamp >= 0) return header.subsamp;
+    throw new Exception("JPEG buffer not initialized");  // no header parsed
+  }
+
+  public int getScaledWidth(int desired_width, int desired_height)
+    throws Exception {
+    if(header == null || header.width < 1 || header.height < 1)
+      throw new Exception("JPEG buffer not initialized");  // no header yet
+    return getScaledWidth(header.width, header.height, desired_width,
+      desired_height);
+  }
+
+  public int getScaledHeight(int output_width, int output_height)
+    throws Exception {
+    if(header == null || header.width < 1 || header.height < 1)
+      throw new Exception("JPEG buffer not initialized");  // no header yet
+    return getScaledHeight(header.width, header.height, output_width,
+      output_height);
+  }
+
+  public void decompress(byte [] dstBuf, int width, int pitch,
+    int height, int pixelFormat, int flags) throws Exception {
+    if(jpegBuf == null) throw new Exception("JPEG buffer not initialized");
+    decompress(jpegBuf, jpegBuf.length, dstBuf, width, pitch, height,
+      TJ.getPixelSize(pixelFormat), flags | TJ.getFlags(pixelFormat));
+  }
+
+  public byte [] decompress(int width, int pitch, int height,
+    int pixelFormat, int flags) throws Exception {
+    if(width < 0 || height < 0 || pitch < 0 || pixelFormat < 0
+      || pixelFormat >= TJ.NUMPIXFORMATS)
+      throw new Exception("Invalid argument in decompress()");
+    if(jpegBuf == null) throw new Exception("JPEG buffer not initialized");
+    int pixelSize = TJ.getPixelSize(pixelFormat);
+    int scaledWidth = getScaledWidth(width, height);
+    int scaledHeight = getScaledHeight(width, height);
+    if(pitch == 0) pitch = scaledWidth * pixelSize;
+    // Size in 64 bits so an oversized image is rejected, not silently wrapped
+    long bufSize = (pixelFormat == TJ.YUV) ?
+      TJ.bufSizeYUV(width, height, header.subsamp) : (long)pitch * scaledHeight;
+    if(bufSize > Integer.MAX_VALUE) throw new Exception("Image is too large");
+    byte [] buf = new byte[(int)bufSize];
+    decompress(jpegBuf, jpegBuf.length, buf, width, pitch, height, pixelSize,
+      flags | TJ.getFlags(pixelFormat));
+    return buf;
+  }
+
+  public void close() throws Exception {
+    if(handle != 0) { destroy();  handle = 0; }  // safe to call repeatedly
+  }
+
+  protected void finalize() throws Throwable {
+    try {
+      close();
+    } catch(Exception e) {
+    }
+    finally {
+      super.finalize();
+    }
+  };
+
+  private native void init() throws Exception;
+
+  private native void destroy() throws Exception;
+
+  private native TJHeaderInfo decompressHeader(byte [] srcBuf, long size)
+    throws Exception;
+
+  private void decompressHeader() throws Exception {
+    header = decompressHeader(jpegBuf, jpegBuf.length);
+  }
+
+  private native void decompress(byte [] srcBuf, long size, byte [] dstBuf,
+    int width, int pitch, int height, int pixelSize, int flags)
+    throws Exception;
+
+  private native int getScaledWidth(int input_width, int input_height,
+    int output_width, int output_height) throws Exception;
+
+  private native int getScaledHeight(int input_width, int input_height,
+    int output_width, int output_height) throws Exception;
+
+  static {
+    System.loadLibrary("turbojpeg");
+  }
+
+  private long handle = 0;
+  private byte [] jpegBuf = null;
+  TJHeaderInfo header = null;
+};
diff --git a/java/org/libjpegturbo/turbojpeg/TJHeaderInfo.java b/java/org/libjpegturbo/turbojpeg/TJHeaderInfo.java
new file mode 100644
index 0000000..e4ee59f
--- /dev/null
+++ b/java/org/libjpegturbo/turbojpeg/TJHeaderInfo.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.libjpegturbo.turbojpeg;
+
+public class TJHeaderInfo {
+  public int subsamp = -1;
+  public int width = -1;
+  public int height = -1;
+};
diff --git a/java/org_libjpegturbo_turbojpeg_TJ.h b/java/org_libjpegturbo_turbojpeg_TJ.h
new file mode 100644
index 0000000..7009251
--- /dev/null
+++ b/java/org_libjpegturbo_turbojpeg_TJ.h
@@ -0,0 +1,73 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_libjpegturbo_turbojpeg_TJ */
+
+#ifndef _Included_org_libjpegturbo_turbojpeg_TJ
+#define _Included_org_libjpegturbo_turbojpeg_TJ
+#ifdef __cplusplus
+extern "C" {
+#endif
+#undef org_libjpegturbo_turbojpeg_TJ_NUMSUBOPT
+#define org_libjpegturbo_turbojpeg_TJ_NUMSUBOPT 4L
+#undef org_libjpegturbo_turbojpeg_TJ_SAMP444
+#define org_libjpegturbo_turbojpeg_TJ_SAMP444 0L
+#undef org_libjpegturbo_turbojpeg_TJ_SAMP422
+#define org_libjpegturbo_turbojpeg_TJ_SAMP422 1L
+#undef org_libjpegturbo_turbojpeg_TJ_SAMP420
+#define org_libjpegturbo_turbojpeg_TJ_SAMP420 2L
+#undef org_libjpegturbo_turbojpeg_TJ_GRAYSCALE
+#define org_libjpegturbo_turbojpeg_TJ_GRAYSCALE 3L
+#undef org_libjpegturbo_turbojpeg_TJ_NUMPIXFORMATS
+#define org_libjpegturbo_turbojpeg_TJ_NUMPIXFORMATS 7L
+#undef org_libjpegturbo_turbojpeg_TJ_RGB
+#define org_libjpegturbo_turbojpeg_TJ_RGB 0L
+#undef org_libjpegturbo_turbojpeg_TJ_BGR
+#define org_libjpegturbo_turbojpeg_TJ_BGR 1L
+#undef org_libjpegturbo_turbojpeg_TJ_RGBX
+#define org_libjpegturbo_turbojpeg_TJ_RGBX 2L
+#undef org_libjpegturbo_turbojpeg_TJ_BGRX
+#define org_libjpegturbo_turbojpeg_TJ_BGRX 3L
+#undef org_libjpegturbo_turbojpeg_TJ_XBGR
+#define org_libjpegturbo_turbojpeg_TJ_XBGR 4L
+#undef org_libjpegturbo_turbojpeg_TJ_XRGB
+#define org_libjpegturbo_turbojpeg_TJ_XRGB 5L
+#undef org_libjpegturbo_turbojpeg_TJ_YUV
+#define org_libjpegturbo_turbojpeg_TJ_YUV 6L
+#undef org_libjpegturbo_turbojpeg_TJ_BOTTOMUP
+#define org_libjpegturbo_turbojpeg_TJ_BOTTOMUP 2L
+#undef org_libjpegturbo_turbojpeg_TJ_FORCEMMX
+#define org_libjpegturbo_turbojpeg_TJ_FORCEMMX 8L
+#undef org_libjpegturbo_turbojpeg_TJ_FORCESSE
+#define org_libjpegturbo_turbojpeg_TJ_FORCESSE 16L
+#undef org_libjpegturbo_turbojpeg_TJ_FORCESSE2
+#define org_libjpegturbo_turbojpeg_TJ_FORCESSE2 32L
+#undef org_libjpegturbo_turbojpeg_TJ_FORCESSE3
+#define org_libjpegturbo_turbojpeg_TJ_FORCESSE3 128L
+#undef org_libjpegturbo_turbojpeg_TJ_FASTUPSAMPLE
+#define org_libjpegturbo_turbojpeg_TJ_FASTUPSAMPLE 256L
+#undef org_libjpegturbo_turbojpeg_TJ_TJ_BGR
+#define org_libjpegturbo_turbojpeg_TJ_TJ_BGR 1L
+#undef org_libjpegturbo_turbojpeg_TJ_TJ_ALPHAFIRST
+#define org_libjpegturbo_turbojpeg_TJ_TJ_ALPHAFIRST 64L
+#undef org_libjpegturbo_turbojpeg_TJ_TJ_YUV
+#define org_libjpegturbo_turbojpeg_TJ_TJ_YUV 512L
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJ
+ * Method:    bufSize
+ * Signature: (II)J
+ */
+JNIEXPORT jlong JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
+  (JNIEnv *, jclass, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJ
+ * Method:    bufSizeYUV
+ * Signature: (III)J
+ */
+JNIEXPORT jlong JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV
+  (JNIEnv *, jclass, jint, jint, jint);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/java/org_libjpegturbo_turbojpeg_TJCompressor.h b/java/org_libjpegturbo_turbojpeg_TJCompressor.h
new file mode 100644
index 0000000..090929c
--- /dev/null
+++ b/java/org_libjpegturbo_turbojpeg_TJCompressor.h
@@ -0,0 +1,37 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_libjpegturbo_turbojpeg_TJCompressor */
+
+#ifndef _Included_org_libjpegturbo_turbojpeg_TJCompressor
+#define _Included_org_libjpegturbo_turbojpeg_TJCompressor
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    init
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    destroy
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJCompressor
+ * Method:    compress
+ * Signature: ([BIIII[BIII)J
+ */
+JNIEXPORT jlong JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress
+  (JNIEnv *, jobject, jbyteArray, jint, jint, jint, jint, jbyteArray, jint, jint, jint);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/java/org_libjpegturbo_turbojpeg_TJDecompressor.h b/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
new file mode 100644
index 0000000..3277bcf
--- /dev/null
+++ b/java/org_libjpegturbo_turbojpeg_TJDecompressor.h
@@ -0,0 +1,61 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_libjpegturbo_turbojpeg_TJDecompressor */
+
+#ifndef _Included_org_libjpegturbo_turbojpeg_TJDecompressor
+#define _Included_org_libjpegturbo_turbojpeg_TJDecompressor
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    init
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    destroy
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    decompressHeader
+ * Signature: ([BJ)Lorg/libjpegturbo/turbojpeg/TJHeaderInfo;
+ */
+JNIEXPORT jobject JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader
+  (JNIEnv *, jobject, jbyteArray, jlong);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    decompress
+ * Signature: ([BJ[BIIIII)V
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
+  (JNIEnv *, jobject, jbyteArray, jlong, jbyteArray, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    getScaledWidth
+ * Signature: (IIII)I
+ */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_getScaledWidth
+  (JNIEnv *, jobject, jint, jint, jint, jint);
+
+/*
+ * Class:     org_libjpegturbo_turbojpeg_TJDecompressor
+ * Method:    getScaledHeight
+ * Signature: (IIII)I
+ */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_getScaledHeight
+  (JNIEnv *, jobject, jint, jint, jint, jint);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/jccolor.c b/jccolor.c
index 5c186d4..5e1c180 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -81,74 +81,6 @@
 #define TABLE_SIZE	(8*(MAXJSAMPLE+1))
 
 
-#if BITS_IN_JSAMPLE == 8
-
-static const unsigned char red_lut[256] = {
-  0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
-  5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 ,
-  10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14,
-  14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
-  19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24,
-  24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28,
-  29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33,
-  33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38,
-  38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43,
-  43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48,
-  48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52,
-  53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57,
-  57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62,
-  62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67,
-  67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71,
-  72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76
-};
-
-static const unsigned char green_lut[256] = {
-  0  , 1  , 1  , 2  , 2  , 3  , 4  , 4  , 5  , 5  , 6  , 6  ,
-  7  , 8  , 8  , 9  , 9  , 10 , 11 , 11 , 12 , 12 , 13 , 14 ,
-  14 , 15 , 15 , 16 , 16 , 17 , 18 , 18 , 19 , 19 , 20 , 21 ,
-  21 , 22 , 22 , 23 , 23 , 24 , 25 , 25 , 26 , 26 , 27 , 28 ,
-  28 , 29 , 29 , 30 , 31 , 31 , 32 , 32 , 33 , 33 , 34 , 35 ,
-  35 , 36 , 36 , 37 , 38 , 38 , 39 , 39 , 40 , 41 , 41 , 42 ,
-  42 , 43 , 43 , 44 , 45 , 45 , 46 , 46 , 47 , 48 , 48 , 49 ,
-  49 , 50 , 50 , 51 , 52 , 52 , 53 , 53 , 54 , 55 , 55 , 56 ,
-  56 , 57 , 58 , 58 , 59 , 59 , 60 , 60 , 61 , 62 , 62 , 63 ,
-  63 , 64 , 65 , 65 , 66 , 66 , 67 , 68 , 68 , 69 , 69 , 70 ,
-  70 , 71 , 72 , 72 , 73 , 73 , 74 , 75 , 75 , 76 , 76 , 77 ,
-  77 , 78 , 79 , 79 , 80 , 80 , 81 , 82 , 82 , 83 , 83 , 84 ,
-  85 , 85 , 86 , 86 , 87 , 87 , 88 , 89 , 89 , 90 , 90 , 91 ,
-  92 , 92 , 93 , 93 , 94 , 95 , 95 , 96 , 96 , 97 , 97 , 98 ,
-  99 , 99 , 100, 100, 101, 102, 102, 103, 103, 104, 104, 105,
-  106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112,
-  113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119,
-  120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126,
-  127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133,
-  134, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140,
-  141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147,
-  148, 149, 149, 150
-};
-
-static const unsigned char blue_lut[256] = {
-  0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
-  2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 ,
-  4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
-  5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 ,
-  7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 ,
-  9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
-  11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
-  13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
-  15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
-  16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
-  18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
-  20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
-  22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24,
-  24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25,
-  26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
-  27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29
-};
-
-#endif
-
-
 /*
  * Initialize for RGB->YCC colorspace conversion.
  */
@@ -259,36 +191,26 @@
 		  JDIMENSION output_row, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  #if BITS_IN_JSAMPLE != 8
+  register int r, g, b;
   register INT32 * ctab = cconvert->rgb_ycc_tab;
-  #endif
   register JSAMPROW inptr;
   register JSAMPROW outptr;
-  JSAMPLE *maxoutptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->image_width;
-  int rindex = rgb_red[cinfo->in_color_space];
-  int gindex = rgb_green[cinfo->in_color_space];
-  int bindex = rgb_blue[cinfo->in_color_space];
-  int rgbstride = rgb_pixelsize[cinfo->in_color_space];
 
   while (--num_rows >= 0) {
     inptr = *input_buf++;
     outptr = output_buf[0][output_row];
-    maxoutptr = &outptr[num_cols];
     output_row++;
-    for (; outptr < maxoutptr; outptr++, inptr += rgbstride) {
+    for (col = 0; col < num_cols; col++) {
+      r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]);
+      g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]);
+      b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]);
+      inptr += rgb_pixelsize[cinfo->in_color_space];
       /* Y */
-      #if BITS_IN_JSAMPLE == 8
-      *outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]]
-	    + blue_lut[inptr[bindex]];
-      #else
-      *outptr = (JSAMPLE)
-	    ((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF]
-	     + ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF]
-	     + ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF])
-	     >> SCALEBITS);
-      #endif
+      outptr[col] = (JSAMPLE)
+		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+		 >> SCALEBITS);
     }
   }
 }
@@ -490,8 +412,12 @@
              cinfo->in_color_space == JCS_EXT_BGRX ||
              cinfo->in_color_space == JCS_EXT_XBGR ||
              cinfo->in_color_space == JCS_EXT_XRGB) {
-      cconvert->pub.start_pass = rgb_ycc_start;
-      cconvert->pub.color_convert = rgb_gray_convert;
+      if (jsimd_can_rgb_gray())
+        cconvert->pub.color_convert = jsimd_rgb_gray_convert;
+      else {
+        cconvert->pub.start_pass = rgb_ycc_start;
+        cconvert->pub.color_convert = rgb_gray_convert;
+      }
     } else if (cinfo->in_color_space == JCS_YCbCr)
       cconvert->pub.color_convert = grayscale_convert;
     else
diff --git a/jpegut.c b/jpegut.c
index 5a864ea..9c41e50 100644
--- a/jpegut.c
+++ b/jpegut.c
@@ -103,8 +103,10 @@
 	}
 }
 
-void dumpbuf(unsigned char *buf, int w, int h, int ps, int flags)
+void dumpbuf(unsigned char *buf, int w, int h, int ps, int scalefactor,
+	int flags)
 {
 	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
 		j;
+	printf("\n");  /* kept after the declarations so C89 compilers accept it */
 	for(i=0; i<h; i++)
@@ -118,15 +120,17 @@
 	}
 }
 
-int checkbuf(unsigned char *buf, int w, int h, int ps, int subsamp, int flags)
+int checkbuf(unsigned char *buf, int w, int h, int ps, int subsamp,
+	int scalefactor, int flags)
 {
 	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
 		_i, j;
+	int halfway=16/scalefactor, blocksize=8/scalefactor;  /* scaled row split / checker size; declared first for C89 */
 	if(flags&TJ_ALPHAFIRST) {roffset++;  goffset++;  boffset++;}
 	if(ps==1) roffset=goffset=boffset=0;
 	if(subsamp==TJ_GRAYSCALE)
 	{
-		for(_i=0; _i<16; _i++)
+		for(_i=0; _i<halfway; _i++)
 		{
 			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
 			for(j=0; j<w; j++)
@@ -134,7 +138,7 @@
 				unsigned char r=buf[(w*i+j)*ps+roffset],
 					g=buf[(w*i+j)*ps+goffset],
 					b=buf[(w*i+j)*ps+boffset];
-				if(((_i/8)+(j/8))%2==0)
+				if(((_i/blocksize)+(j/blocksize))%2==0)
 				{
 					if(r<253 || g<253 || b<253) return 0;
 				}
@@ -144,7 +148,7 @@
 				}
 			}
 		}
-		for(_i=16; _i<h; _i++)
+		for(_i=halfway; _i<h; _i++)
 		{
 			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
 			for(j=0; j<w; j++)
@@ -152,7 +156,7 @@
 				unsigned char r=buf[(w*i+j)*ps+roffset],
 					g=buf[(w*i+j)*ps+goffset],
 					b=buf[(w*i+j)*ps+boffset];
-				if(((_i/8)+(j/8))%2==0)
+				if(((_i/blocksize)+(j/blocksize))%2==0)
 				{
 					if(r>2 || g>2 || b>2) return 0;
 				}
@@ -165,13 +169,13 @@
 	}
 	else
 	{
-		for(_i=0; _i<16; _i++)
+		for(_i=0; _i<halfway; _i++)
 		{
 			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
 			for(j=0; j<w; j++)
 			{
 				if(buf[(w*i+j)*ps+roffset]<253) return 0;
-				if(((_i/8)+(j/8))%2==0)
+				if(((_i/blocksize)+(j/blocksize))%2==0)
 				{
 					if(buf[(w*i+j)*ps+goffset]<253) return 0;
 					if(buf[(w*i+j)*ps+boffset]<253) return 0;
@@ -183,13 +187,13 @@
 				}
 			}
 		}
-		for(_i=16; _i<h; _i++)
+		for(_i=halfway; _i<h; _i++)
 		{
 			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
 			for(j=0; j<w; j++)
 			{
 				if(buf[(w*i+j)*ps+boffset]>2) return 0;
-				if(((_i/8)+(j/8))%2==0)
+				if(((_i/blocksize)+(j/blocksize))%2==0)
 				{
 					if(buf[(w*i+j)*ps+roffset]>2) return 0;
 					if(buf[(w*i+j)*ps+goffset]>2) return 0;
@@ -377,7 +381,8 @@
 	memset(jpegbuf, 0, TJBUFSIZE(w, h));
 
 	t=rrtime();
-	_catch(tjCompress(hnd, bmpbuf, w, 0, h, ps, jpegbuf, size, subsamp, qual, flags));
+	_catch(tjCompress(hnd, bmpbuf, w, 0, h, ps, jpegbuf, size, subsamp, qual,
+		flags));
 	t=rrtime()-t;
 
 	if(yuv==YUVENCODE)
@@ -399,16 +404,15 @@
 	if(bmpbuf) free(bmpbuf);
 }
 
-void gentestbmp(tjhandle hnd, unsigned char *jpegbuf, unsigned long jpegsize,
-	int w, int h, int ps, char *basefilename, int subsamp, int flags)
+void _gentestbmp(tjhandle hnd, unsigned char *jpegbuf, unsigned long jpegsize,
+	int w, int h, int ps, char *basefilename, int subsamp, int flags,
+	int scalefactor)
 {
 	unsigned char *bmpbuf=NULL;
-	const char *pixformat;  int _w=0, _h=0;  double t;
+	const char *pixformat;  int _hdrw=0, _hdrh=0, _hdrsubsamp=-1;  double t;
+	int scaledw=(w+scalefactor-1)/scalefactor, scaledh=(h+scalefactor-1)/scalefactor;
+	int temp1, temp2;
 	unsigned long size=0;
-	int hsf=_hsf[subsamp], vsf=_vsf[subsamp];
-	int pw=PAD(w, hsf), ph=PAD(h, vsf);
-	int cw=pw/hsf, ch=ph/vsf;
-	int ypitch=PAD(pw, 4), uvpitch=PAD(cw, 4);
 
 	if(yuv==YUVDECODE) flags|=TJ_YUV;
 	else if(yuv==YUVENCODE) return;
@@ -427,19 +431,31 @@
 	if(yuv==YUVDECODE)
 		printf("JPEG -> YUV %s ... ", _subnames[subsamp]);
 	else
-		printf("JPEG -> %s %s ... ", pixformat,
+	{
+		printf("JPEG -> %s %s ", pixformat,
 			(flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ");
+		if(scalefactor) printf("1/%d ... ", scalefactor);
+		else printf("... ");
+	}
 
-	_catch(tjDecompressHeader(hnd, jpegbuf, jpegsize, &_w, &_h));
-	if(_w!=w || _h!=h)
+	_catch(tjDecompressHeader2(hnd, jpegbuf, jpegsize, &_hdrw, &_hdrh,
+		&_hdrsubsamp));
+	if(_hdrw!=w || _hdrh!=h || _hdrsubsamp!=subsamp)
 	{
 		printf("Incorrect JPEG header\n");  bailout();
 	}
 
+	temp1=scaledw;  temp2=scaledh;
+	_catch(tjScaledSize(w, h, &temp1, &temp2));
+	if(temp1!=scaledw || temp2!=scaledh)
+	{
+		printf("Scaled size mismatch\n");  bailout();
+	}
+
 	if(yuv==YUVDECODE)
-		size=ypitch*ph + (subsamp==TJ_GRAYSCALE? 0:uvpitch*ch*2);
+		size=TJBUFSIZEYUV(w, h, subsamp);
 	else
-		size=w*h*ps;
+		size=scaledw*scaledh*ps;
 	if((bmpbuf=(unsigned char *)malloc(size+1))==NULL)
 	{
 		printf("ERROR: Could not allocate buffer\n");  bailout();
@@ -447,30 +463,48 @@
 	memset(bmpbuf, 0, size+1);
 
 	t=rrtime();
-	_catch(tjDecompress(hnd, jpegbuf, jpegsize, bmpbuf, w, w*ps, h, ps, flags));
+	_catch(tjDecompress(hnd, jpegbuf, jpegsize, bmpbuf, scaledw, 0, scaledh, ps,
+		flags));
 	t=rrtime()-t;
 
 	if(yuv==YUVDECODE)
 	{
-		if(checkbufyuv(bmpbuf, size, pw, ph, subsamp))
+		if(checkbufyuv(bmpbuf, size, w, h, subsamp))
 			printf("Passed.");
 		else {printf("FAILED!");  exitstatus=-1;}
 	}
 	else
 	{
-		if(checkbuf(bmpbuf, w, h, ps, subsamp, flags)) printf("Passed.");
+		if(checkbuf(bmpbuf, scaledw, scaledh, ps, subsamp, scalefactor, flags))
+			printf("Passed.");
 		else
 		{
 			printf("FAILED!");  exitstatus=-1;
-			dumpbuf(bmpbuf, w, h, ps, flags);
+			dumpbuf(bmpbuf, scaledw, scaledh, ps, scalefactor, flags);
 		}
 	}
-	printf("  %f ms\n\n", t*1000.);
+	printf("  %f ms\n", t*1000.);
 
 	finally:
 	if(bmpbuf) free(bmpbuf);
 }
 
+/* Runs _gentestbmp() at scaling factors 1, 2, 4 and 8 for 4:4:4 and grayscale
+   JPEGs when yuv is 0, otherwise once at factor 1; then prints a newline. */
+void gentestbmp(tjhandle hnd, unsigned char *jpegbuf, unsigned long jpegsize,
+	int w, int h, int ps, char *basefilename, int subsamp, int flags)
+{
+	int sf, maxsf=1;
+
+	if(!yuv && (subsamp==TJ_444 || subsamp==TJ_GRAYSCALE)) maxsf=8;
+	for(sf=1; sf<=maxsf; sf*=2)
+	{
+		_gentestbmp(hnd, jpegbuf, jpegsize, w, h, ps, basefilename, subsamp,
+			flags, sf);
+	}
+	printf("\n");
+}
+
 void dotest(int w, int h, int ps, int subsamp, char *basefilename)
 {
 	tjhandle hnd=NULL, dhnd=NULL;  unsigned char *jpegbuf=NULL;
@@ -548,7 +582,7 @@
 				bmpbuf[i2*4+1]=pixels[i2%9][1];
 				bmpbuf[i2*2+2]=pixels[i2%9][0];
 			}
-			_catch(tjCompress(hnd, bmpbuf, i, i*4, j, 4,
+			_catch(tjCompress(hnd, bmpbuf, i, 0, j, 4,
 				jpgbuf, &size, TJ_444, 100, TJ_BGR));
 			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
 
@@ -562,7 +596,7 @@
 				if(i2%2==0) bmpbuf[i2]=0xFF;
 				else bmpbuf[i2]=0;
 			}
-			_catch(tjCompress(hnd, bmpbuf, j, j*4, i, 4,
+			_catch(tjCompress(hnd, bmpbuf, j, 0, i, 4,
 				jpgbuf, &size, TJ_444, 100, TJ_BGR));
 			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
 		}
diff --git a/jpgtest.c b/jpgtest.c
index 8451642..773a32e 100644
--- a/jpgtest.c
+++ b/jpgtest.c
@@ -33,7 +33,8 @@
 
 enum {YUVENCODE=1, YUVDECODE};
 int forcemmx=0, forcesse=0, forcesse2=0, forcesse3=0, fastupsample=0,
-	decomponly=0, yuv=0;
+	decomponly=0, yuv=0, quiet=0, dotile=0, pf=BMP_BGR, bu=0, useppm=0,
+	scalefactor=1;
 const int _ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4};
 const int _flags[BMPPIXELFORMATS]={0, 0, TJ_BGR, TJ_BGR,
 	TJ_BGR|TJ_ALPHAFIRST, TJ_ALPHAFIRST};
@@ -64,8 +65,150 @@
 	printf(format, val);
 }
 
-void dotest(unsigned char *srcbuf, int w, int h, int pf, int bu,
-	int jpegsub, int qual, char *filename, int dotile, int useppm, int quiet)
+// Decompression test:  decompresses the JPEG tile(s) in jpegbuf[] repeatedly
+// until rrtime() shows 5 elapsed (presumably seconds), prints the throughput,
+// then saves the output and, for non-YUV output with srcbuf!=NULL and
+// scalefactor==1, an error image.  Allocates rgbbuf if NULL.  Returns 0 or -1.
+int decomptest(unsigned char *srcbuf, unsigned char **jpegbuf,
+	unsigned long *comptilesize, unsigned char *rgbbuf, int w, int h,
+	int jpegsub, int qual, char *filename, int tilesizex, int tilesizey)
+{
+	char tempstr[1024], qualstr[5]="\0";
+	FILE *outfile=NULL;  tjhandle hnd=NULL;
+	int flags=(forcemmx?TJ_FORCEMMX:0)|(forcesse?TJ_FORCESSE:0)
+		|(forcesse2?TJ_FORCESSE2:0)|(forcesse3?TJ_FORCESSE3:0)
+		|(fastupsample?TJ_FASTUPSAMPLE:0);
+	int i, j, ITER, rgbbufalloc=0, retval=-1;
+	double start, elapsed;
+	int ps=_ps[pf];
+	int hsf=_hsf[jpegsub], vsf=_vsf[jpegsub];
+	int pw=PAD(w, hsf), ph=PAD(h, vsf);
+	int cw=pw/hsf, ch=ph/vsf;
+	int ypitch=PAD(pw, 4), uvpitch=PAD(cw, 4);
+	int yuvsize=ypitch*ph + (jpegsub==TJ_GRAYSCALE? 0:uvpitch*ch*2);
+	// YUV output is not scaled.  Test yuv, since flags lacks TJ_YUV until below.
+	int scaledw=(yuv==YUVDECODE)? w : (w+scalefactor-1)/scalefactor;
+	int scaledh=(yuv==YUVDECODE)? h : (h+scalefactor-1)/scalefactor;
+	int pitch=scaledw*ps;
+
+	if(qual>0)
+	{
+		snprintf(qualstr, 5, "Q%d", qual);  qualstr[4]=0;
+	}
+
+	flags |= _flags[pf];
+	if(bu) flags |= TJ_BOTTOMUP;
+	if(yuv==YUVDECODE) flags |= TJ_YUV;
+	if((hnd=tjInitDecompress())==NULL)
+		_throwtj("executing tjInitDecompress()");
+
+	if(rgbbuf==NULL)
+	{
+		if((rgbbuf=(unsigned char *)malloc(max(yuvsize, pitch*scaledh))) == NULL)
+			_throwunix("allocating image buffer");
+		rgbbufalloc=1;
+	}
+	// Grey image means decompressor did nothing
+	memset(rgbbuf, 127, max(yuvsize, pitch*scaledh));
+
+	if(tjDecompress(hnd, jpegbuf[0], comptilesize[0], rgbbuf, scaledw, pitch,
+		scaledh, ps, flags)==-1)
+		_throwtj("executing tjDecompress()");
+	ITER=0;  start=rrtime();
+	do
+	{
+		int tilen=0;
+		for(i=0; i<h; i+=tilesizey)
+		{
+			for(j=0; j<w; j+=tilesizex)
+			{
+				int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);  /* NOTE(review): unused */
+				if(tjDecompress(hnd, jpegbuf[tilen], comptilesize[tilen],
+					&rgbbuf[pitch*i+ps*j], scaledw, pitch, scaledh, ps, flags)==-1)
+					_throwtj("executing tjDecompress()");
+				tilen++;
+			}
+		}
+		ITER++;
+	}	while((elapsed=rrtime()-start)<5.);
+	if(tjDestroy(hnd)==-1) _throwtj("executing tjDestroy()");
+	hnd=NULL;
+	if(quiet)
+	{
+		printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
+		printf("\n");
+	}
+	else
+	{
+		printf("D--> Frame rate:           %f fps\n", (double)ITER/elapsed);
+		printf("     Dest. throughput:     %f Megapixels/sec\n",
+			(double)(w*h)/1000000.*(double)ITER/elapsed);
+	}
+	if(yuv==YUVDECODE)
+	{
+		sprintf(tempstr, "%s_%s%s.yuv", filename, _subnames[jpegsub], qualstr);
+		if((outfile=fopen(tempstr, "wb"))==NULL)
+			_throwunix("opening YUV image for output");
+		if(fwrite(rgbbuf, yuvsize, 1, outfile)!=1)
+			_throwunix("writing YUV image");
+		fclose(outfile);  outfile=NULL;
+	}
+	else
+	{
+		if(tilesizex==w && tilesizey==h)
+		{
+			if(decomponly)
+				sprintf(tempstr, "%s_full.%s", filename, useppm?"ppm":"bmp");
+			else
+				sprintf(tempstr, "%s_%s%s_full.%s", filename, _subnames[jpegsub],
+					qualstr, useppm?"ppm":"bmp");
+		}
+		else sprintf(tempstr, "%s_%s%s_%dx%d.%s", filename, _subnames[jpegsub],
+			qualstr, tilesizex, tilesizey, useppm?"ppm":"bmp");
+		if(savebmp(tempstr, rgbbuf, scaledw, scaledh, pf, pitch, bu)==-1)
+			_throwbmp("saving bitmap");
+		sprintf(strrchr(tempstr, '.'), "-err.%s", useppm?"ppm":"bmp");
+		if(srcbuf && scalefactor==1)
+		{
+			if(!quiet)
+				printf("Computing compression error and saving to %s.\n", tempstr);
+			if(jpegsub==TJ_GRAYSCALE)
+			{
+				for(j=0; j<h; j++)
+				{
+					for(i=0; i<w*ps; i+=ps)
+					{
+						int y=(int)((double)srcbuf[w*ps*j+i+_rindex[pf]]*0.299
+							+ (double)srcbuf[w*ps*j+i+_gindex[pf]]*0.587
+							+ (double)srcbuf[w*ps*j+i+_bindex[pf]]*0.114 + 0.5);
+						if(y>255) y=255;  if(y<0) y=0;
+						rgbbuf[pitch*j+i+_rindex[pf]]=abs(rgbbuf[pitch*j+i+_rindex[pf]]-y);
+						rgbbuf[pitch*j+i+_gindex[pf]]=abs(rgbbuf[pitch*j+i+_gindex[pf]]-y);
+						rgbbuf[pitch*j+i+_bindex[pf]]=abs(rgbbuf[pitch*j+i+_bindex[pf]]-y);
+					}
+				}
+			}
+			else
+			{
+				for(j=0; j<h; j++) for(i=0; i<w*ps; i++)
+					rgbbuf[pitch*j+i]=abs(rgbbuf[pitch*j+i]-srcbuf[w*ps*j+i]);
+			}
+			if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
+				_throwbmp("saving bitmap");
+		}
+	}
+
+	retval=0;  // success; the cleanup below is shared with the error path
+
+	bailout:
+	if(outfile) {fclose(outfile);  outfile=NULL;}
+	if(hnd) {tjDestroy(hnd);  hnd=NULL;}
+	if(rgbbuf && rgbbufalloc) {free(rgbbuf);  rgbbuf=NULL;}
+	return retval;
+}
+
+void dotest(unsigned char *srcbuf, int w, int h, int jpegsub, int qual,
+	char *filename)
 {
 	char tempstr[1024];
 	FILE *outfile=NULL;  tjhandle hnd;
@@ -191,89 +334,9 @@
 		if(yuv==YUVENCODE) goto bailout;
 
 		// Decompression test
-		if(yuv==YUVDECODE) flags |= TJ_YUV;
-		memset(rgbbuf, 127, max(yuvsize, pitch*h));  // Grey image means decompressor did nothing
-		if((hnd=tjInitDecompress())==NULL)
-			_throwtj("executing tjInitDecompress()");
-		if(tjDecompress(hnd, jpegbuf[0], jpgbufsize, rgbbuf, tilesizex, pitch,
-			tilesizey, ps, flags)==-1)
-			_throwtj("executing tjDecompress()");
-		ITER=0;
-		start=rrtime();
-		do
-		{
-			int tilen=0;
-			for(i=0; i<h; i+=tilesizey)
-			{
-				for(j=0; j<w; j+=tilesizex)
-				{
-					int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
-					if(tjDecompress(hnd, jpegbuf[tilen], comptilesize[tilen],
-						&rgbbuf[pitch*i+ps*j], tempw, pitch, temph, ps, flags)==-1)
-						_throwtj("executing tjDecompress()");
-					tilen++;
-				}
-			}
-			ITER++;
-		}	while((elapsed=rrtime()-start)<5.);
-		if(tjDestroy(hnd)==-1) _throwtj("executing tjDestroy()");
-		hnd=NULL;
-		if(quiet)
-		{
-			printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
-			printf("\n");
-		}
-		else
-		{
-			printf("D--> Frame rate:           %f fps\n", (double)ITER/elapsed);
-			printf("     Dest. throughput:     %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)ITER/elapsed);
-		}
-		if(yuv==YUVDECODE)
-		{
-			sprintf(tempstr, "%s_%sQ%d.yuv", filename, _subnames[jpegsub], qual);
-			if((outfile=fopen(tempstr, "wb"))==NULL)
-				_throwunix("opening YUV image for output");
-			if(fwrite(rgbbuf, yuvsize, 1, outfile)!=1)
-				_throwunix("writing YUV image");
-			fclose(outfile);  outfile=NULL;
-		}
-		else
-		{
-			if(tilesizex==w && tilesizey==h)
-				sprintf(tempstr, "%s_%sQ%d_full.%s", filename, _subnames[jpegsub], qual,
-					useppm?"ppm":"bmp");
-			else sprintf(tempstr, "%s_%sQ%d_%dx%d.%s", filename, _subnames[jpegsub],
-				qual, tilesizex, tilesizey, useppm?"ppm":"bmp");
-			if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
-				_throwbmp("saving bitmap");
-			sprintf(strrchr(tempstr, '.'), "-err.%s", useppm?"ppm":"bmp");
-			if(!quiet)
-				printf("Computing compression error and saving to %s.\n", tempstr);
-			if(jpegsub==TJ_GRAYSCALE)
-			{
-				for(j=0; j<h; j++)
-				{
-					for(i=0; i<w*ps; i+=ps)
-					{
-						int y=(int)((double)srcbuf[w*ps*j+i+_rindex[pf]]*0.299
-							+ (double)srcbuf[w*ps*j+i+_gindex[pf]]*0.587
-							+ (double)srcbuf[w*ps*j+i+_bindex[pf]]*0.114 + 0.5);
-						if(y>255) y=255;  if(y<0) y=0;
-						rgbbuf[pitch*j+i+_rindex[pf]]=abs(rgbbuf[pitch*j+i+_rindex[pf]]-y);
-						rgbbuf[pitch*j+i+_gindex[pf]]=abs(rgbbuf[pitch*j+i+_gindex[pf]]-y);
-						rgbbuf[pitch*j+i+_bindex[pf]]=abs(rgbbuf[pitch*j+i+_bindex[pf]]-y);
-					}
-				}
-			}		
-			else
-			{
-				for(j=0; j<h; j++) for(i=0; i<w*ps; i++)
-					rgbbuf[pitch*j+i]=abs(rgbbuf[pitch*j+i]-srcbuf[w*ps*j+i]);
-			}
-			if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
-				_throwbmp("saving bitmap");
-		}
+		if(decomptest(srcbuf, jpegbuf, comptilesize, rgbbuf, w, h, jpegsub, qual,
+			filename, tilesizex, tilesizey)==-1)
+			goto bailout;
 
 		// Cleanup
 		if(outfile) {fclose(outfile);  outfile=NULL;}
@@ -304,25 +367,15 @@
 }
 
 
-void dodecomptest(char *filename, int pf, int bu, int useppm,
-	int quiet)
+void dodecomptest(char *filename)
 {
-	char tempstr[1024];
-	FILE *file=NULL;  tjhandle hnd;
-	unsigned char *jpegbuf=NULL, *rgbbuf=NULL;
-	double start, elapsed;
-	int w, h, ITER;
+	FILE *file=NULL;  tjhandle hnd=NULL;
+	unsigned char *jpegbuf=NULL;
+	int w=0, h=0, jpegsub=-1;
 	unsigned long jpgbufsize=0;
-	int flags=(forcemmx?TJ_FORCEMMX:0)|(forcesse?TJ_FORCESSE:0)
-		|(forcesse2?TJ_FORCESSE2:0)|(forcesse3?TJ_FORCESSE3:0)
-		|(fastupsample?TJ_FASTUPSAMPLE:0);
-	int ps=_ps[pf], pitch, jpegsub=-1;
 	char *temp=NULL;
-	int hsf, vsf, pw, ph, cw, ch, ypitch, uvpitch, yuvsize;
 
-	flags |= _flags[pf];
-	if(bu) flags |= TJ_BOTTOMUP;
-	if(yuv==YUVDECODE) flags |= TJ_YUV;
+	useppm=1;
 
 	if((file=fopen(filename, "rb"))==NULL)
 		_throwunix("opening file");
@@ -342,81 +395,36 @@
 	if((hnd=tjInitDecompress())==NULL) _throwtj("executing tjInitDecompress()");
 	if(tjDecompressHeader2(hnd, jpegbuf, jpgbufsize, &w, &h, &jpegsub)==-1)
 		_throwtj("executing tjDecompressHeader2()");
-
-	hsf=_hsf[jpegsub], vsf=_vsf[jpegsub];
-	pw=PAD(w, hsf), ph=PAD(h, vsf);
-	cw=pw/hsf, ch=ph/vsf;
-	ypitch=PAD(pw, 4), uvpitch=PAD(cw, 4);
-	yuvsize=ypitch*ph + (jpegsub==TJ_GRAYSCALE? 0:uvpitch*ch*2);
-
-	pitch=w*ps;
+	if(tjDestroy(hnd)==-1) _throwtj("executing tjDestroy()");
+	hnd=NULL;
 
 	if(quiet==1)
 	{
 		printf("\nAll performance values in Mpixels/sec\n\n");
-		printf("Bitmap\tBitmap\tImage Size\tDecomp\n"),
-		printf("Format\tOrder\t X    Y  \tPerf\n\n");
-		printf("%s\t%s\t%-4d %-4d\t", _pfname[pf], bu?"BU":"TD", w, h);
+		printf("Bitmap\tBitmap\tJPEG\tImage Size\tDecomp\n"),
+		printf("Format\tOrder\tFormat\t  X    Y  \tPerf\n\n");
+		printf("%s\t%s\t%s\t%-4d  %-4d\t", _pfname[pf], bu?"BU":"TD",
+			_subnamel[jpegsub], w, h);
 	}
-
-	if((rgbbuf=(unsigned char *)malloc(max(yuvsize, pitch*h)))==NULL)
-		_throwunix("allocating image buffer");
-
-	if(!quiet)
+	else
 	{
 		if(yuv==YUVDECODE)
 			printf("\n>>>>>  JPEG --> YUV %s  <<<<<\n", _subnamel[jpegsub]);
 		else
 			printf("\n>>>>>  JPEG --> %s (%s)  <<<<<\n", _pfname[pf],
 				bu?"Bottom-up":"Top-down");
-		printf("\nImage size: %d x %d\n", w, h);
-	}
-
-	memset(rgbbuf, 127, max(yuvsize, pitch*h));  // Grey image means decompressor did nothing
-	if(tjDecompress(hnd, jpegbuf, jpgbufsize, rgbbuf, w, pitch, h, ps, flags)==-1)
-		_throwtj("executing tjDecompress()");
-	ITER=0;
-	start=rrtime();
-	do
-	{
-		if(tjDecompress(hnd, jpegbuf, jpgbufsize, rgbbuf, w, pitch, h, ps, flags)
-			==-1)
-			_throwtj("executing tjDecompress()");
-		ITER++;
-	}	while((elapsed=rrtime()-start)<5.);
-	if(tjDestroy(hnd)==-1) _throwtj("executing tjDestroy()");
-	hnd=NULL;
-	if(quiet)
-	{
-		printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
+		printf("\nImage size: %d x %d", w, h);
+		if(scalefactor!=1) printf(" --> %d x %d", (w+scalefactor-1)/scalefactor,
+			(h+scalefactor-1)/scalefactor);
 		printf("\n");
 	}
-	else
-	{
-		printf("D--> Frame rate:           %f fps\n", (double)ITER/elapsed);
-		printf("     Dest. throughput:     %f Megapixels/sec\n",
-			(double)(w*h)/1000000.*(double)ITER/elapsed);
-	}
-	sprintf(tempstr, "%s_full.%s", filename, useppm?"ppm":"bmp");
-	if(yuv==YUVDECODE)
-	{
-		sprintf(tempstr, "%s_%s.yuv", filename, _subnames[jpegsub]);
-		if((file=fopen(tempstr, "wb"))==NULL)
-			_throwunix("opening YUV image for output");
-		if(fwrite(rgbbuf, yuvsize, 1, file)!=1)
-			_throwunix("writing YUV image");
-		fclose(file);  file=NULL;
-	}
-	else
-	{
-		if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
-			_throwbmp("saving bitmap");
-	}
+
+	decomptest(NULL, &jpegbuf, &jpgbufsize, NULL, w, h, jpegsub, 0, filename, w,
+		h);
 
 	bailout:
 	if(file) {fclose(file);  file=NULL;}
 	if(jpegbuf) {free(jpegbuf);  jpegbuf=NULL;}
-	if(rgbbuf) {free(rgbbuf);  rgbbuf=NULL;}
 	if(hnd) {tjDestroy(hnd);  hnd=NULL;}
 	return;
 }
@@ -424,36 +432,35 @@
 
 void usage(char *progname)
 {
-	printf("USAGE: %s <Inputfile (BMP|PPM))> <%% Quality>\n", progname);
-	printf("       %s <Inputfile (JPG))>\n\n", progname);
-	printf("       [-tile]\n");
-	printf("       Test performance of the codec when the image is encoded\n");
-	printf("       as separate tiles of varying sizes.\n\n");
-	printf("       [-forcemmx] [-forcesse] [-forcesse2] [-forcesse3]\n");
-	printf("       Force MMX, SSE, SSE2, or SSE3 code paths in the underlying codec\n\n");
-	printf("       [-rgb | -bgr | -rgbx | -bgrx | -xbgr | -xrgb]\n");
-	printf("       Test the specified color conversion path in the codec (default: BGR)\n\n");
-	printf("       [-fastupsample]\n");
-	printf("       Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
-	printf("       YUV decoding in libjpeg decompressor\n\n");
-	printf("       [-quiet]\n");
-	printf("       Output in tabular rather than verbose format\n\n");
-	printf("       [-yuvencode]\n");
-	printf("       Encode RGB input as planar YUV rather than compressing as JPEG\n\n");
-	printf("       [-yuvdecode]\n");
-	printf("       Decode JPEG image to planar YUV rather than RGB\n\n");
-	printf("       NOTE: If the quality is specified as a range, i.e. 90-100, a separate\n");
-	printf("       test will be performed for all quality values in the range.\n");
+	printf("USAGE: %s\n", progname);
+	printf("       <Inputfile (BMP|PPM)> <%% Quality> [options]\n\n");
+	printf("       %s\n", progname);
+	printf("       <Inputfile (JPG)> [options]\n\n");
+	printf("Options:\n\n");
+	printf("-tile = Test performance of the codec when the image is encoded as separate\n");
+	printf("     tiles of varying sizes.\n");
+	printf("-forcemmx, -forcesse, -forcesse2, -forcesse3 =\n");
+	printf("     Force MMX, SSE, SSE2, or SSE3 code paths in the underlying codec\n");
+	printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
+	printf("     Test the specified color conversion path in the codec (default: BGR)\n");
+	printf("-fastupsample = Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
+	printf("     YUV decoding in libjpeg decompressor\n");
+	printf("-quiet = Output results in tabular rather than verbose format\n");
+	printf("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG\n");
+	printf("-yuvdecode = Decode JPEG image to planar YUV rather than RGB\n");
+	printf("-scale 1/N = scale down the width/height of the decompressed JPEG image by a\n");
+	printf("     factor of N (N = 1, 2, 4, or 8)\n\n");
+	printf("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate\n");
+	printf("test will be performed for all quality values in the range.\n\n");
 	exit(1);
 }
 
 
 int main(int argc, char *argv[])
 {
-	unsigned char *bmpbuf=NULL;  int w, h, i, useppm=0;
-	int qual, dotile=0, quiet=0, hiqual=-1;  char *temp;
-	int pf=BMP_BGR;
-	int bu=0, minarg=2;
+	unsigned char *bmpbuf=NULL;  int w, h, i;
+	int qual, hiqual=-1;  char *temp;
+	int minarg=2;
 
 	if(argc<minarg) usage(argv[0]);
 
@@ -535,9 +542,24 @@
 			if(!stricmp(argv[i], "-bottomup")) bu=1;
 			if(!stricmp(argv[i], "-quiet")) quiet=1;
 			if(!stricmp(argv[i], "-qq")) quiet=2;
+			if(!stricmp(argv[i], "-scale") && i<argc-1)
+			{
+				int temp1=0, temp2=0;
+				if(sscanf(argv[++i], "%d/%d", &temp1, &temp2)!=2
+					|| temp1!=1 || temp2<1 || temp2>8 || (temp2&(temp2-1))!=0)
+					usage(argv[0]);
+				scalefactor=temp2;
+			}
 		}
 	}
 
+	if(scalefactor!=1 && dotile)
+	{
+		printf("Disabling tiled compression/decompression tests, because these tests do not\n");
+		printf("work when scaled decompression is enabled.\n");
+		dotile=0;
+	}
+
 	if(!decomponly)
 	{
 		if(loadbmp(argv[1], &bmpbuf, &w, &h, pf, 1, bu)==-1)
@@ -555,20 +577,20 @@
 
 	if(decomponly)
 	{
-		dodecomptest(argv[1], pf, bu, 1, quiet);
+		dodecomptest(argv[1]);
 		goto bailout;
 	}
 	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_GRAYSCALE, i, argv[1], dotile, useppm, quiet);
+		dotest(bmpbuf, w, h, TJ_GRAYSCALE, i, argv[1]);
 	if(quiet) printf("\n");
 	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_420, i, argv[1], dotile, useppm, quiet);
+		dotest(bmpbuf, w, h, TJ_420, i, argv[1]);
 	if(quiet) printf("\n");
 	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_422, i, argv[1], dotile, useppm, quiet);
+		dotest(bmpbuf, w, h, TJ_422, i, argv[1]);
 	if(quiet) printf("\n");
 	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_444, i, argv[1], dotile, useppm, quiet);
+		dotest(bmpbuf, w, h, TJ_444, i, argv[1]);
 
 	bailout:
 	if(bmpbuf) free(bmpbuf);
diff --git a/jsimd.h b/jsimd.h
index b663791..3fa2c43 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -2,6 +2,7 @@
  * jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2011 D. R. Commander
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,8 +14,10 @@
 
 #ifdef NEED_SHORT_EXTERNAL_NAMES
 #define jsimd_can_rgb_ycc                 jSCanRgbYcc
+#define jsimd_can_rgb_gray                jSCanRgbGry
 #define jsimd_can_ycc_rgb                 jSCanYccRgb
 #define jsimd_rgb_ycc_convert             jSRgbYccConv
+#define jsimd_rgb_gray_convert            jSRgbGryConv
 #define jsimd_ycc_rgb_convert             jSYccRgbConv
 #define jsimd_can_h2v2_downsample         jSCanH2V2Down
 #define jsimd_can_h2v1_downsample         jSCanH2V1Down
@@ -35,12 +38,17 @@
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 EXTERN(int) jsimd_can_rgb_ycc JPP((void));
+EXTERN(int) jsimd_can_rgb_gray JPP((void));
 EXTERN(int) jsimd_can_ycc_rgb JPP((void));
 
 EXTERN(void) jsimd_rgb_ycc_convert
         JPP((j_compress_ptr cinfo,
              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
              JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_rgb_gray_convert
+        JPP((j_compress_ptr cinfo,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
 EXTERN(void) jsimd_ycc_rgb_convert
         JPP((j_decompress_ptr cinfo,
              JSAMPIMAGE input_buf, JDIMENSION input_row,
diff --git a/jsimd_none.c b/jsimd_none.c
index 7ff3074..9787902 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -2,7 +2,7 @@
  * jsimd_none.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009 D. R. Commander
+ * Copyright 2009-2011 D. R. Commander
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -25,6 +25,12 @@
 }
 
 GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
 jsimd_can_ycc_rgb (void)
 {
   return 0;
@@ -38,6 +44,13 @@
 }
 
 GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
 jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
                        JSAMPIMAGE input_buf, JDIMENSION input_row,
                        JSAMPARRAY output_buf, int num_rows)
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
index 48e161d..c21eef8 100644
--- a/release/libjpeg-turbo.spec.in
+++ b/release/libjpeg-turbo.spec.in
@@ -43,7 +43,7 @@
 #-->%setup -q
 
 #-->%build
-#-->configure libdir=/opt/%{name}/%{__lib} mandir=/opt/%{name}/man JPEG_LIB_VERSION=@JPEG_LIB_VERSION@ SO_MAJOR_VERSION=@SO_MAJOR_VERSION@ SO_MINOR_VERSION=@SO_MINOR_VERSION@ --with-pic
+#-->configure libdir=/opt/%{name}/%{__lib} mandir=/opt/%{name}/man JPEG_LIB_VERSION=@JPEG_LIB_VERSION@ SO_MAJOR_VERSION=@SO_MAJOR_VERSION@ SO_MINOR_VERSION=@SO_MINOR_VERSION@ --with-pic @RPM_CONFIG_ARGS@
 #-->make DESTDIR=$RPM_BUILD_ROOT libdir=/opt/%{name}/%{__lib} mandir=/opt/%{name}/man
 
 %install
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 49931d9..c7bbbcf 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -28,6 +28,7 @@
 BUILD=@BUILD@
 SRCDIR=@srcdir@
 BUILDDIR32=@srcdir@/osxx86
+BUILDJNILIB=@BUILDJNILIB@
 if [ $# -gt 0 ]; then
 	if [ "$1" = "universal" ]; then
 		UNIVERSAL=1
@@ -122,6 +123,9 @@
 
 ln -fs /usr/include/turbojpeg.h $PKGROOT/opt/$PACKAGE_NAME/include/
 ln -fs /usr/lib/libturbojpeg.a $PKGROOT/opt/$PACKAGE_NAME/lib/
+if [ $BUILDJNILIB = 1 ]; then
+	ln -fs libturbojpeg.dylib $PKGROOT/usr/lib/libturbojpeg.jnilib
+fi
 if [ ! -h $PKGROOT/opt/$PACKAGE_NAME/lib32 ]; then
 	ln -fs lib $PKGROOT/opt/$PACKAGE_NAME/lib32
 fi
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 81c23af..f19fdf5 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -11,7 +11,7 @@
 libsimd_la_SOURCES = jsimd_x86_64.c \
 	jsimd.h jsimdcfg.inc.h \
 	jsimdext.inc jcolsamp.inc jdct.inc \
-	jfsseflt-64.asm \
+	jfsseflt-64.asm jcgrass2-64.asm \
 	jccolss2-64.asm jdcolss2-64.asm \
 	jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
 	jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
@@ -20,6 +20,7 @@
 
 jccolss2-64.lo: jcclrss2-64.asm
 jdcolss2-64.lo: jdclrss2-64.asm
+jcgrass2-64.lo: jcgryss2-64.asm
 jdmerss2-64.lo: jdmrgss2-64.asm
 endif
 
@@ -29,20 +30,22 @@
 	jsimd.h jsimdcfg.inc.h \
 	jsimdext.inc jcolsamp.inc jdct.inc \
 	jsimdcpu.asm \
-	jccolmmx.asm jdcolmmx.asm \
+	jccolmmx.asm jdcolmmx.asm jcgrammx.asm \
 	jcsammmx.asm jdsammmx.asm jdmermmx.asm \
 	jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \
 	jimmxred.asm jimmxint.asm jimmxfst.asm \
 	jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \
 	jcqntsse.asm jfsseflt.asm jisseflt.asm \
-	jccolss2.asm jdcolss2.asm \
+	jccolss2.asm jdcolss2.asm jcgrass2.asm \
 	jcsamss2.asm jdsamss2.asm jdmerss2.asm \
 	jcqnts2i.asm jfss2fst.asm jfss2int.asm \
 	jiss2red.asm jiss2int.asm jiss2fst.asm \
 	jcqnts2f.asm jiss2flt.asm
 
 jccolmmx.lo: jcclrmmx.asm
+jcgrammx.lo: jcgrymmx.asm
 jccolss2.lo: jcclrss2.asm
+jcgrass2.lo: jcgryss2.asm
 jdcolmmx.lo: jdclrmmx.asm
 jdcolss2.lo: jdclrss2.asm
 jdmermmx.lo: jdmrgmmx.asm
diff --git a/simd/jcgrammx.asm b/simd/jcgrammx.asm
new file mode 100644
index 0000000..dd46cc5
--- /dev/null
+++ b/simd/jcgrammx.asm
@@ -0,0 +1,113 @@
+;
+; jcgrammx.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2011 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_gray_convert_mmx)
+
+EXTN(jconst_rgb_gray_convert_mmx):
+
+PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
+PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcgrymmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
+%include "jcgrymmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
+%include "jcgrymmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
+%include "jcgrymmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
+%include "jcgrymmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
+%include "jcgrymmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
+%include "jcgrymmx.asm"
diff --git a/simd/jcgrass2-64.asm b/simd/jcgrass2-64.asm
new file mode 100644
index 0000000..9f8a01a
--- /dev/null
+++ b/simd/jcgrass2-64.asm
@@ -0,0 +1,110 @@
+;
+; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2011, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcgryss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
+%include "jcgryss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
+%include "jcgryss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
+%include "jcgryss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
+%include "jcgryss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
+%include "jcgryss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
+%include "jcgryss2-64.asm"
diff --git a/simd/jcgrass2.asm b/simd/jcgrass2.asm
new file mode 100644
index 0000000..f284e0f
--- /dev/null
+++ b/simd/jcgrass2.asm
@@ -0,0 +1,110 @@
+;
+; jcgrass2.asm - grayscale colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2011, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcgryss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
+%include "jcgryss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
+%include "jcgryss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
+%include "jcgryss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
+%include "jcgryss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
+%include "jcgryss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
+%include "jcgryss2.asm"
diff --git a/simd/jcgrymmx.asm b/simd/jcgrymmx.asm
new file mode 100644
index 0000000..93d0936
--- /dev/null
+++ b/simd/jcgrymmx.asm
@@ -0,0 +1,359 @@
+;
+; jcgrymmx.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2011 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	xor	eax,eax
+	mov	al, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	xor	edx,edx
+	mov	dx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	mmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	mmG, DWORD [esi+ecx]
+	psllq	mmA, DWORD_BIT
+	por	mmA,mmG
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	movq	mmG,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	mov	ecx, SIZEOF_MMWORD
+	jmp	short .rgb_gray_cnv
+.column_ld16:
+	test	cl, 2*SIZEOF_MMWORD
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_gray_cnv
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_gray_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+	; mmA=(00 10 20 01 11 21 02 12)
+	; mmG=(22 03 13 23 04 14 24 05)
+	; mmF=(15 25 06 16 26 07 17 27)
+
+	movq      mmD,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
+	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
+
+	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
+	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
+
+	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
+	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
+
+	movq      mmE,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
+	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
+
+	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
+
+	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
+	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
+
+	pxor      mmH,mmH
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
+
+	movq      mmB,mmE
+	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
+	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
+
+	movq      mmF,mmD
+	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
+	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_MMWORD/8
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_MMWORD/8
+	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_MMWORD/4
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_MMWORD/4
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+	test	cl, SIZEOF_MMWORD/2
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_gray_cnv
+	movq	mmD,mmA
+	movq	mmC,mmF
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_gray_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+	; mmA=(00 10 20 30 01 11 21 31)
+	; mmF=(02 12 22 32 03 13 23 33)
+	; mmD=(04 14 24 34 05 15 25 35)
+	; mmC=(06 16 26 36 07 17 27 37)
+
+	movq      mmB,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
+	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
+
+	movq      mmG,mmD
+	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
+	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
+
+	movq      mmE,mmA
+	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
+
+	movq      mmH,mmB
+	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
+	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
+
+	pxor      mmF,mmF
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
+
+	movq      mmD,mmB
+	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
+	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
+
+	movq      mmG,mmE
+	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
+	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
+
+	punpcklbw mmF,mmH
+	punpckhbw mmH,mmH
+	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
+	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+	movq      mm6,mm1
+	punpcklwd mm1,mm3
+	punpckhwd mm6,mm3
+	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	movq      mm7, mm6	; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	movq      mm6,mm0
+	punpcklwd mm0,mm2
+	punpckhwd mm6,mm2
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	movq      MMWORD [wk(0)], mm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movq      MMWORD [wk(1)], mm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	movq      mm0, mm5	; mm0=BO
+	movq      mm6, mm4	; mm6=BE
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm3
+	punpckhwd mm4,mm3
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
+
+	paddd     mm0, mm1
+	paddd     mm4, mm7
+	paddd     mm0,mm3
+	paddd     mm4,mm3
+	psrld     mm0,SCALEBITS		; mm0=YOL
+	psrld     mm4,SCALEBITS		; mm4=YOH
+	packssdw  mm0,mm4		; mm0=YO
+
+	movq      mm4,mm6
+	punpcklwd mm6,mm2
+	punpckhwd mm4,mm2
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
+
+	paddd     mm6, MMWORD [wk(0)]
+	paddd     mm4, MMWORD [wk(1)]
+	paddd     mm6,mm2
+	paddd     mm4,mm2
+	psrld     mm6,SCALEBITS		; mm6=YEL
+	psrld     mm4,SCALEBITS		; mm4=YEH
+	packssdw  mm6,mm4		; mm6=YE
+
+	psllw     mm0,BYTE_BIT
+	por       mm6,mm0		; mm6=Y
+	movq      MMWORD [edi], mm6	; Save Y
+
+	sub	ecx, byte SIZEOF_MMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
+	add	edi, byte SIZEOF_MMWORD			; outptr0
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/simd/jcgryss2-64.asm b/simd/jcgryss2-64.asm
new file mode 100644
index 0000000..3a52ec2
--- /dev/null
+++ b/simd/jcgryss2-64.asm
@@ -0,0 +1,366 @@
+;
+; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2011, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
+;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                              JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+
+	global	EXTN(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = rsp pointing at the saved rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; keep the unaligned rsp for .return
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]			; reserve the wk[] scratch area below rbp
+	collect_args
+	push	rbx
+
+	mov	ecx, r10d		; img_width is 32-bit: ignore upper half of r10
+	test	rcx,rcx
+	jz	near .return		; nothing to convert for a zero-width image
+
+	push	rcx			; keep img_width while rcx indexes the row
+
+	mov rsi, r12
+	mov	ecx, r13d		; output_row is 32-bit: ignore upper half of r13
+	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+	pop	rcx			; rcx = img_width again
+
+	mov rsi, r11
+	mov	eax, r14d		; zero-extends into rax
+	test	eax,eax			; signed 32-bit test so num_rows < 0 exits too
+	jle	near .return
+.rowloop:
+	push	rdi			; save the output row-pointer cursor
+	push	rsi			; save the input row-pointer cursor
+	push	rcx			; col
+
+	mov	rsi, JSAMPROW [rsi]	; inptr
+	mov	rdi, JSAMPROW [rdi]	; outptr0
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop	; a full group of columns: skip the partial load
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:				; fewer than SIZEOF_XMMWORD columns remain
+	push	rax			; rax and rdx are borrowed as scratch below
+	push	rdx
+	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE		; SIZEOF_BYTE bit of the byte count set?
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_BYTE
+	movzx	rax, BYTE [rsi+rcx]	; fetch the last remaining byte
+.column_ld2:
+	test	cl, SIZEOF_WORD		; SIZEOF_WORD bit of the byte count set?
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_WORD
+	movzx	rdx, WORD [rsi+rcx]	; fetch the word that ends the remaining bytes
+	shl	rax, WORD_BIT		; earlier-fetched byte goes above the word
+	or	rax,rdx
+.column_ld4:
+	movd	xmmA,eax		; xmmA = tail bytes gathered so far
+	pop	rdx
+	pop	rax
+	test	cl, SIZEOF_DWORD	; SIZEOF_DWORD bit of the byte count set?
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_DWORD	; make room below the bytes already gathered
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD	; SIZEOF_MMWORD bit of the byte count set?
+	jz	short .column_ld16
+	sub	rcx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_MMWORD	; make room below the bytes already gathered
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD	; one whole XMMWORD ahead of the tail?
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA		; gathered tail becomes the second XMMWORD
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	mov	rcx, SIZEOF_XMMWORD	; loop tail subtracts this, leaving 0 columns
+	jmp	short .rgb_gray_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD	; two whole XMMWORDs ahead of the tail?
+	mov	rcx, SIZEOF_XMMWORD	; mov keeps the flags set by test
+	jz	short .rgb_gray_cnv
+	movdqa	xmmB,xmmA		; gathered tail becomes the third XMMWORD
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_gray_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:				; fewer than SIZEOF_XMMWORD columns remain
+	test	cl, SIZEOF_XMMWORD/16	; this bit of the column count set?
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]	; DWORD at the end of the row
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8	; this bit of the column count set?
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD	; make room below the data already gathered
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4	; this bit of the column count set?
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA		; gathered tail moves up one register
+	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2	; this bit of the column count set?
+	mov	rcx, SIZEOF_XMMWORD	; mov keeps the flags set by test
+	jz	short .rgb_gray_cnv
+	movdqa	xmmF,xmmA		; gathered data becomes XMMWORDs 2 and 3
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_gray_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	movdqa    xmm7, xmm6	; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(1)], xmm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	movdqa    xmm0, xmm5	; xmm0=BO
+	movdqa    xmm6, xmm4	; xmm6=BE
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, xmm1
+	paddd     xmm4, xmm7
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(0)]
+	paddd     xmm4, XMMWORD [wk(1)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [rdi], xmm6	; Save Y
+
+	sub	rcx, byte SIZEOF_XMMWORD		; this group of columns is done
+	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte SIZEOF_XMMWORD		; outptr0
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop			; another full group remains
+	test	rcx,rcx
+	jnz	near .column_ld1			; a partial group remains
+
+	pop	rcx			; col
+	pop	rsi			; input row-pointer cursor pushed at .rowloop
+	pop	rdi			; output row-pointer cursor pushed at .rowloop
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
+	add	rdi, byte SIZEOF_JSAMPROW	; step to the next output row pointer
+	dec	rax				; num_rows
+	jg	near .rowloop			; repeat while rows remain
+
+.return:
+	pop	rbx		; matches the push rbx done after collect_args
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- value stored at [rbp] on entry (rsp after push rbp)
+	pop	rbp		; restore the caller's rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/simd/jcgryss2.asm b/simd/jcgryss2.asm
new file mode 100644
index 0000000..6eac030
--- /dev/null
+++ b/simd/jcgryss2.asm
@@ -0,0 +1,385 @@
+;
+; jcgryss2.asm - grayscale colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2011, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
+;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                              JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+
+	global	EXTN(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = esp pointing at the saved ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; keep the unaligned esp for .return
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the wk[] scratch area below ebp
+	pushpic	eax		; make room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]
+	test	ecx,ecx
+	jz	near .return		; nothing to convert for a zero-width image
+
+	push	ecx			; keep img_width while ecx indexes the row
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx			; ecx = img_width again
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]	; last use of eax as the argument base
+	test	eax,eax
+	jle	near .return		; num_rows <= 0: nothing to do
+	alignx	16,7
+.rowloop:
+	pushpic	eax			; presumably saves num_rows in PIC builds, where eax is reused below
+	push	edi			; save the output row-pointer cursor
+	push	esi			; save the input row-pointer cursor
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop	; a full group of columns: skip the partial load
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:				; fewer than SIZEOF_XMMWORD columns remain
+	push	eax			; eax and edx are borrowed as scratch below
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE		; SIZEOF_BYTE bit of the byte count set?
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	movzx	eax, BYTE [esi+ecx]	; fetch the last remaining byte
+.column_ld2:
+	test	cl, SIZEOF_WORD		; SIZEOF_WORD bit of the byte count set?
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	movzx	edx, WORD [esi+ecx]	; fetch the word that ends the remaining bytes
+	shl	eax, WORD_BIT		; earlier-fetched byte goes above the word
+	or	eax,edx
+.column_ld4:
+	movd	xmmA,eax		; xmmA = tail bytes gathered so far
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD	; SIZEOF_DWORD bit of the byte count set?
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_DWORD	; make room below the bytes already gathered
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD	; SIZEOF_MMWORD bit of the byte count set?
+	jz	short .column_ld16
+	sub	ecx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_MMWORD	; make room below the bytes already gathered
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD	; one whole XMMWORD ahead of the tail?
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA		; gathered tail becomes the second XMMWORD
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	mov	ecx, SIZEOF_XMMWORD	; loop tail subtracts this, leaving 0 columns
+	jmp	short .rgb_gray_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD	; two whole XMMWORDs ahead of the tail?
+	mov	ecx, SIZEOF_XMMWORD	; mov keeps the flags set by test
+	jz	short .rgb_gray_cnv
+	movdqa	xmmB,xmmA		; gathered tail becomes the third XMMWORD
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_gray_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:				; fewer than SIZEOF_XMMWORD columns remain
+	test	cl, SIZEOF_XMMWORD/16	; this bit of the column count set?
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]	; DWORD at the end of the row
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8	; this bit of the column count set?
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD	; make room below the data already gathered
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4	; this bit of the column count set?
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA		; gathered tail moves up one register
+	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2	; this bit of the column count set?
+	mov	ecx, SIZEOF_XMMWORD	; mov keeps the flags set by test
+	jz	short .rgb_gray_cnv
+	movdqa	xmmF,xmmA		; gathered data becomes XMMWORDs 2 and 3
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_gray_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	movdqa    xmm7, xmm6	; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(1)], xmm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	movdqa    xmm0, xmm5	; xmm0=BO
+	movdqa    xmm6, xmm4	; xmm6=BE
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, xmm1
+	paddd     xmm4, xmm7
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(0)]
+	paddd     xmm4, XMMWORD [wk(1)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [edi], xmm6	; Save Y
+
+	sub	ecx, byte SIZEOF_XMMWORD		; this group of columns is done
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	edi, byte SIZEOF_XMMWORD		; outptr0
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop			; another full group remains
+	test	ecx,ecx
+	jnz	near .column_ld1			; a partial group remains
+
+	pop	ecx			; col
+	pop	esi			; input row-pointer cursor pushed at .rowloop
+	pop	edi			; output row-pointer cursor pushed at .rowloop
+	poppic	eax			; counterpart of the pushpic eax at .rowloop
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW	; step to the next output row pointer
+	dec	eax				; num_rows
+	jg	near .rowloop			; repeat while rows remain
+
+.return:
+	pop	edi		; restore registers pushed in the prologue
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- value stored at [ebp] on entry (esp after push ebp)
+	pop	ebp		; restore the caller's ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 89ac1b7..7bfdd17 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,6 +2,7 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2011 D. R. Commander
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -43,6 +44,14 @@
 #define jsimd_extbgrx_ycc_convert_sse2        jSEXTBGRXYCCS2
 #define jsimd_extxbgr_ycc_convert_sse2        jSEXTXBGRYCCS2
 #define jsimd_extxrgb_ycc_convert_sse2        jSEXTXRGBYCCS2
+#define jconst_rgb_gray_convert_sse2          jSCRGBGRYS2
+#define jsimd_rgb_gray_convert_sse2           jSRGBGRYS2
+#define jsimd_extrgb_gray_convert_sse2        jSEXTRGBGRYS2
+#define jsimd_extrgbx_gray_convert_sse2       jSEXTRGBXGRYS2
+#define jsimd_extbgr_gray_convert_sse2        jSEXTBGRGRYS2
+#define jsimd_extbgrx_gray_convert_sse2       jSEXTBGRXGRYS2
+#define jsimd_extxbgr_gray_convert_sse2       jSEXTXBGRGRYS2
+#define jsimd_extxrgb_gray_convert_sse2       jSEXTXRGBGRYS2
 #define jconst_ycc_rgb_convert_sse2           jSCYCCRGBS2
 #define jsimd_ycc_rgb_convert_sse2            jSYCCRGBS2
 #define jsimd_ycc_extrgb_convert_sse2         jSYCCEXTRGBS2
@@ -163,6 +172,35 @@
              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
              JDIMENSION output_row, int num_rows));
 
+EXTERN(void) jsimd_rgb_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
 EXTERN(void) jsimd_ycc_rgb_convert_mmx
         JPP((JDIMENSION out_width,
              JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -222,6 +260,36 @@
              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
              JDIMENSION output_row, int num_rows));
 
+extern const int jconst_rgb_gray_convert_sse2[];
+EXTERN(void) jsimd_rgb_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
 extern const int jconst_ycc_rgb_convert_sse2[];
 EXTERN(void) jsimd_ycc_rgb_convert_sse2
         JPP((JDIMENSION out_width,
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index d9bb774..f77c5ef 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -2,7 +2,7 @@
  * jsimd_i386.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009 D. R. Commander
+ * Copyright 2009-2011 D. R. Commander
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -84,6 +84,28 @@
 }
 
 GLOBAL(int)
+jsimd_can_rgb_gray (void)  /* returns 1 if a SIMD RGB->gray path is usable, else 0 */
+{
+  init_simd();
+
+  /* The SIMD routines support only these build-time values */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))  /* SSE2 needs its constant table aligned */
+    return 1;
+  if (simd_support & JSIMD_MMX)  /* otherwise MMX is enough */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
 jsimd_can_ycc_rgb (void)
 {
   init_simd();
@@ -155,6 +177,55 @@
 }
 
 GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,  /* dispatch RGB->gray rows to SSE2 or MMX */
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space)  /* pick the routine pair for the input pixel layout */
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_extrgb_gray_convert_sse2;
+      mmxfct=jsimd_extrgb_gray_convert_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_extrgbx_gray_convert_sse2;
+      mmxfct=jsimd_extrgbx_gray_convert_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_extbgr_gray_convert_sse2;
+      mmxfct=jsimd_extbgr_gray_convert_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_extbgrx_gray_convert_sse2;
+      mmxfct=jsimd_extbgrx_gray_convert_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_extxbgr_gray_convert_sse2;
+      mmxfct=jsimd_extxbgr_gray_convert_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_extxrgb_gray_convert_sse2;
+      mmxfct=jsimd_extxrgb_gray_convert_mmx;
+      break;
+    default:  /* every other color space uses the generic rgb routines */
+      sse2fct=jsimd_rgb_gray_convert_sse2;
+      mmxfct=jsimd_rgb_gray_convert_mmx;
+      break;
+  }
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))  /* same test as jsimd_can_rgb_gray */
+    sse2fct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_MMX)  /* if neither flag is set, nothing is converted */
+    mmxfct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
 jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
                        JSAMPIMAGE input_buf, JDIMENSION input_row,
                        JSAMPARRAY output_buf, int num_rows)
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index 7659249..2951268 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -2,7 +2,7 @@
  * jsimd_x86_64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009 D. R. Commander
+ * Copyright 2009-2011 D. R. Commander
  * 
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -47,6 +47,23 @@
 }
 
 GLOBAL(int)
+jsimd_can_rgb_gray (void)  /* returns 1 if the SSE2 RGB->gray path is usable, else 0 */
+{
+  /* The SIMD routines support only these build-time values */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))  /* constant table must be aligned */
+    return 0;
+
+  return 1;  /* no simd_support check here, unlike the i386 build */
+}
+
+GLOBAL(int)
 jsimd_can_ycc_rgb (void)
 {
   /* The code is optimised for these values only */
@@ -99,6 +116,41 @@
 }
 
 GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  /* Choose the SSE2 routine that corresponds to cinfo->in_color_space */
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_extrgb_gray_convert_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_extrgbx_gray_convert_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_extbgr_gray_convert_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_extbgrx_gray_convert_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_extxbgr_gray_convert_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_extxrgb_gray_convert_sse2;
+      break;
+    default:  /* every other color space uses the generic RGB routine */
+      sse2fct=jsimd_rgb_gray_convert_sse2;
+      break;
+  }
+  /* Called directly: this function makes no runtime SIMD capability check */
+  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
 jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
                        JSAMPIMAGE input_buf, JDIMENSION input_row,
                        JSAMPARRAY output_buf, int num_rows)
diff --git a/testimggray.jpg b/testimggray.jpg
new file mode 100644
index 0000000..95505a2
--- /dev/null
+++ b/testimggray.jpg
Binary files differ
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
new file mode 100644
index 0000000..1e0a353
--- /dev/null
+++ b/turbojpeg-jni.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stddef.h>
+#include "turbojpeg.h"
+#include <jni.h>
+#include "java/org_libjpegturbo_turbojpeg_TJCompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJDecompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJ.h"
+/* Raise java.lang.Exception carrying msg, then jump to the caller's bailout label */
+#define _throw(msg) {  \
+	jclass _exccls=(*env)->FindClass(env, "java/lang/Exception");  \
+	if(!_exccls) goto bailout;  \
+	(*env)->ThrowNew(env, _exccls, msg);  \
+	goto bailout;  \
+}
+/* Evaluate f; jump to the caller's bailout label if the result is zero/NULL */
+#define bailif0(f) {if(!(f)) goto bailout;}
+/* Load the Java "handle" field into 'handle'; size_t (not long) keeps all pointer bits on Win64 */
+#define gethandle() {  \
+	jclass _cls=(*env)->GetObjectClass(env, obj);  \
+	jfieldID _fid;  \
+	if(!_cls) goto bailout;  \
+	bailif0(_fid=(*env)->GetFieldID(env, _cls, "handle", "J"));  \
+	handle=(tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);  \
+}
+
+JNIEXPORT jlong JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
+	(JNIEnv *env, jclass cls, jint width, jint height)
+{	/* TJBUFSIZE() fails with (unsigned long)-1; that is not jlong -1 if long is 32-bit */
+	jlong retval=TJBUFSIZE(width, height);
+	if(retval==(jlong)(unsigned long)-1) _throw(tjGetErrorStr());
+
+	bailout:
+	return retval;
+}
+
+JNIEXPORT jlong JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV
+	(JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
+{	/* TJBUFSIZEYUV() fails with (unsigned long)-1; that is not jlong -1 if long is 32-bit */
+	jlong retval=TJBUFSIZEYUV(width, height, subsamp);
+	if(retval==(jlong)(unsigned long)-1) _throw(tjGetErrorStr());
+
+	bailout:
+	return retval;
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
+	(JNIEnv *env, jobject obj)
+{	/* Creates a compressor instance and stores it in this object's "handle" field */
+	jclass cls;
+	jfieldID fid;
+	tjhandle handle;
+
+	if((handle=tjInitCompress())==NULL)
+		_throw(tjGetErrorStr());
+	/* Store via size_t, not long: long is 32-bit on Win64 and would truncate */
+	bailif0(cls=(*env)->GetObjectClass(env, obj));
+	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
+	(*env)->SetLongField(env, obj, fid, (size_t)handle);
+
+	bailout:
+	return;
+}
+
+JNIEXPORT jlong JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress
+	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
+		jint height, jint pixelsize, jbyteArray dst, jint jpegsubsamp,
+		jint jpegqual, jint flags)
+{	/* Compresses src into dst; returns the size value filled in by tjCompress() */
+	tjhandle handle=0;
+	unsigned long size=0;
+	unsigned char *srcbuf=NULL, *dstbuf=NULL;
+
+	gethandle();
+	/* Pin both Java arrays; they must be released before any other JNI call */
+	bailif0(srcbuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+	bailif0(dstbuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+	if(tjCompress(handle, srcbuf, width, pitch, height, pixelsize, dstbuf,
+		&size, jpegsubsamp, jpegqual, flags)==-1)
+	{	/* Unpin before throwing; NULL the pointers so bailout won't release twice */
+		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstbuf, 0);  dstbuf=NULL;
+		(*env)->ReleasePrimitiveArrayCritical(env, src, srcbuf, 0);  srcbuf=NULL;
+		_throw(tjGetErrorStr());
+	}
+
+	bailout:
+	if(dstbuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstbuf, 0);
+	if(srcbuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcbuf, 0);
+	return size;
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
+	(JNIEnv *env, jobject obj)
+{	/* Passes this object's stored handle to tjDestroy(); throws if that fails */
+	tjhandle handle=0;
+
+	gethandle();
+	/* NOTE(review): the Java "handle" field is not cleared after tjDestroy() */
+	if(tjDestroy(handle)==-1) _throw(tjGetErrorStr());
+
+	bailout:
+	return;
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
+	(JNIEnv *env, jobject obj)
+{	/* Creates a decompressor instance and stores it in this object's "handle" field */
+	jclass cls;
+	jfieldID fid;
+	tjhandle handle;
+
+	if((handle=tjInitDecompress())==NULL) _throw(tjGetErrorStr());
+	/* Store via size_t, not long: long is 32-bit on Win64 and would truncate */
+	bailif0(cls=(*env)->GetObjectClass(env, obj));
+	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
+	(*env)->SetLongField(env, obj, fid, (size_t)handle);
+
+	bailout:
+	return;
+}
+
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_getScaledWidth
+	(JNIEnv *env, jobject obj, jint input_width, jint input_height,
+		jint output_width, jint output_height)
+{	/* the desired size is passed by address and updated by tjScaledSize() */
+	if(tjScaledSize(input_width, input_height, &output_width, &output_height)
+		==-1)
+		_throw(tjGetErrorStr());
+
+	bailout:
+	return output_width;
+}
+
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_getScaledHeight
+	(JNIEnv *env, jobject obj, jint input_width, jint input_height,
+		jint output_width, jint output_height)
+{	/* the desired size is passed by address and updated by tjScaledSize() */
+	if(tjScaledSize(input_width, input_height, &output_width, &output_height)
+		==-1)
+		_throw(tjGetErrorStr());
+
+	bailout:
+	return output_height;
+}
+
+JNIEXPORT jobject JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader
+	(JNIEnv *env, jobject obj, jbyteArray src, jlong size)
+{	/* Returns a new TJHeaderInfo object, or NULL if any step bailed out */
+	jclass jhicls=NULL;
+	jfieldID fid;
+	tjhandle handle=0;
+	unsigned char *srcbuf=NULL;
+	int width=0, height=0, jpegsubsamp=-1;
+	jobject jhiobj=NULL;
+
+	gethandle();
+
+	bailif0(srcbuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+	if(tjDecompressHeader2(handle, srcbuf, (unsigned long)size, 
+		&width, &height, &jpegsubsamp)==-1)
+	{	/* unpin the array before raising the exception */
+		(*env)->ReleasePrimitiveArrayCritical(env, src, srcbuf, 0);
+		_throw(tjGetErrorStr());
+	}
+	(*env)->ReleasePrimitiveArrayCritical(env, src, srcbuf, 0);  srcbuf=NULL;
+	/* Build the result object; AllocObject() does not run a Java constructor */
+	bailif0(jhicls=(*env)->FindClass(env, "org/libjpegturbo/turbojpeg/TJHeaderInfo"));
+	bailif0(jhiobj=(*env)->AllocObject(env, jhicls));
+	/* Copy the parsed header values into the int fields subsamp, width, height */
+	bailif0(fid=(*env)->GetFieldID(env, jhicls, "subsamp", "I"));
+	(*env)->SetIntField(env, jhiobj, fid, jpegsubsamp);
+	bailif0(fid=(*env)->GetFieldID(env, jhicls, "width", "I"));
+	(*env)->SetIntField(env, jhiobj, fid, width);
+	bailif0(fid=(*env)->GetFieldID(env, jhicls, "height", "I"));
+	(*env)->SetIntField(env, jhiobj, fid, height);
+
+	bailout:
+	return jhiobj;
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
+	(JNIEnv *env, jobject obj, jbyteArray src, jlong size, jbyteArray dst,
+		jint width, jint pitch, jint height, jint pixelsize, jint flags)
+{	/* Decompresses the JPEG data in src into dst via tjDecompress() */
+	tjhandle handle=0;
+	unsigned char *srcbuf=NULL, *dstbuf=NULL;
+
+	gethandle();
+	/* Pin both Java arrays; they must be released before any other JNI call */
+	bailif0(srcbuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+	bailif0(dstbuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+	if(tjDecompress(handle, srcbuf, (unsigned long)size, dstbuf, width, pitch,
+		height, pixelsize, flags)==-1)
+	{	/* Unpin before throwing; NULL the pointers so bailout won't release twice */
+		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstbuf, 0);  dstbuf=NULL;
+		(*env)->ReleasePrimitiveArrayCritical(env, src, srcbuf, 0);  srcbuf=NULL;
+		_throw(tjGetErrorStr());
+	}
+
+	bailout:
+	if(dstbuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstbuf, 0);
+	if(srcbuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcbuf, 0);
+	return;
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy
+	(JNIEnv *env, jobject obj)
+{	/* delegates to the compressor's destroy, which calls tjDestroy() on the handle */
+	Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy(env, obj);
+}
diff --git a/turbojpeg-mapfile b/turbojpeg-mapfile
index e54d59c..7fc4a2d 100755
--- a/turbojpeg-mapfile
+++ b/turbojpeg-mapfile
@@ -3,9 +3,11 @@
 		tjInitCompress;
 		tjCompress;
 		TJBUFSIZE;
+		TJBUFSIZEYUV;
 		tjInitDecompress;
 		tjDecompressHeader;
 		tjDecompressHeader2;
+		tjScaledSize;
 		tjDecompress;
 		tjDestroy;
 		tjGetErrorStr;
diff --git a/turbojpeg-mapfile.jni b/turbojpeg-mapfile.jni
new file mode 100755
index 0000000..bf2ce4b
--- /dev/null
+++ b/turbojpeg-mapfile.jni
@@ -0,0 +1,27 @@
+{
+	global:
+		tjInitCompress;
+		tjCompress;
+		TJBUFSIZE;
+		TJBUFSIZEYUV;
+		tjInitDecompress;
+		tjDecompressHeader;
+		tjDecompressHeader2;
+		tjScaledSize;
+		tjDecompress;
+		tjDestroy;
+		tjGetErrorStr;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSize;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_init;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_init;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy;		
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_getScaledHeight;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_getScaledWidth;
+	local:
+		*;
+};
diff --git a/turbojpeg.h b/turbojpeg.h
index 47e5aa6..5fa91f5 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -153,8 +153,31 @@
 	unsigned char *dstbuf, unsigned long *size,
 	int jpegsubsamp, int jpegqual, int flags);
 
+
+/*
+  unsigned long TJBUFSIZE(int width, int height)
+
+  Convenience function which returns the maximum size of the buffer required to
+  hold a JPEG image with the given width and height
+
+  RETURNS: (unsigned long)-1 if arguments are out of bounds
+*/
 DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height);
 
+
+/*
+  unsigned long TJBUFSIZEYUV(int width, int height, int subsamp)
+
+  Convenience function which returns the size of the buffer required to
+  hold a YUV planar image with the given width, height, and level of
+  chrominance subsampling
+
+  RETURNS: (unsigned long)-1 if arguments are out of bounds
+*/
+DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
+  int subsamp);
+
+
 /*
   tjhandle tjInitDecompress(void)
 
@@ -176,8 +199,7 @@
 
   [INPUT] j = instance handle previously returned from a call to
      tjInitDecompress()
-  [INPUT] srcbuf = pointer to a user-allocated buffer containing the JPEG image
-     to decompress
+  [INPUT] srcbuf = pointer to a user-allocated buffer containing a JPEG image
   [INPUT] size = size of the JPEG image buffer (in bytes)
   [OUTPUT] width = width (in pixels) of the JPEG image
   [OUTPUT] height = height (in pixels) of the JPEG image
@@ -199,6 +221,26 @@
 
 
 /*
+  int tjScaledSize(int input_width, int input_height,
+     int *output_width, int *output_height)
+
+  [INPUT] input_width = width (in pixels) of the JPEG image
+  [INPUT] input_height = height (in pixels) of the JPEG image
+  [INPUT/OUTPUT] output_width, output_height = Before calling this function,
+     *output_width and *output_height should be set to the desired dimensions
+     of the output image.  Upon returning from this function, they will be set
+     to the dimensions of the largest scaled down image that TurboJPEG can
+     produce without exceeding the desired dimensions.  If either *output_width
+     or *output_height is set to 0, then the corresponding dimension will not
+     be considered when determining the scaled image size.
+
+  RETURNS: 0 on success, -1 if arguments are out of bounds
+*/
+DLLEXPORT int DLLCALL tjScaledSize(int input_width, int input_height,
+	int *output_width, int *output_height);
+
+
+/*
   int tjDecompress(tjhandle j,
      unsigned char *srcbuf, unsigned long size,
      unsigned char *dstbuf, int width, int pitch, int height, int pixelsize,
@@ -210,22 +252,38 @@
      to decompress
   [INPUT] size = size of the JPEG image buffer (in bytes)
   [INPUT] dstbuf = pointer to user-allocated image buffer which will receive
-     the bitmap image.  This buffer should normally be pitch*height
-     bytes in size, although this pointer may also be used to decompress into
-     a specific region of a larger buffer.
-  [INPUT] width =  width (in pixels) of the destination image
-  [INPUT] pitch = bytes per line of the destination image (width*pixelsize if
-     the bitmap is unpadded, else TJPAD(width*pixelsize) if each line of the
-     bitmap is padded to the nearest 32-bit boundary, such as is the case for
-     Windows bitmaps.  You can also be clever and use this parameter to skip
-     lines, etc.  Setting this parameter to 0 is the equivalent of setting it
-     to width*pixelsize.
-  [INPUT] height = height (in pixels) of the destination image
+     the bitmap image.  This buffer should normally be pitch*scaled_height
+     bytes in size, where scaled_height is determined by calling
+     tjScaledSize() with the height of the desired output image.  This pointer
+     may also be used to decompress into a specific region of a
+     larger buffer.
+  [INPUT] width = desired width (in pixels) of the destination image.  If this
+     is smaller than the width of the JPEG image being decompressed, then
+     TurboJPEG will use scaling in the JPEG decompressor to generate the
+     largest possible image that will fit within the desired width.  If width
+     is set to 0, then only the height will be considered when determining the
+     scaled image size.
+  [INPUT] pitch = bytes per line of the destination image.  Normally, this is
+     scaled_width*pixelsize if the bitmap image is unpadded, else
+     TJPAD(scaled_width*pixelsize) if each line of the bitmap is padded to the
+     nearest 32-bit boundary, such as is the case for Windows bitmaps.
+     (NOTE: scaled_width can be determined by calling tjScaledSize().)  You can
+     also be clever and use this parameter to skip lines, etc.  Setting this
+     parameter to 0 is the equivalent of setting it to scaled_width*pixelsize.
+  [INPUT] height = desired height (in pixels) of the destination image.  If
+     this is smaller than the height of the JPEG image being decompressed, then
+     TurboJPEG will use scaling in the JPEG decompressor to generate the
+     largest possible image that will fit within the desired height.  If
+     height is set to 0, then only the width will be considered when
+     determining the scaled image size.
   [INPUT] pixelsize = size (in bytes) of each pixel in the destination image
      RGBX/BGRX/XRGB/XBGR: 4, RGB/BGR: 3, Grayscale: 1
   [INPUT] flags = the bitwise OR of one or more of the flags described in the
      "Flags" section above.
 
+  NOTE: The width, pitch, height, and pixelsize parameters are ignored if
+  decompressing to a YUV planar image.
+
   RETURNS: 0 on success, -1 on error
 */
 DLLEXPORT int DLLCALL tjDecompress(tjhandle j,
diff --git a/turbojpegl.c b/turbojpegl.c
index c1c62f6..93f4b6a 100644
--- a/turbojpegl.c
+++ b/turbojpegl.c
@@ -114,8 +114,33 @@
 
 DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height)
 {
-	// This allows enough room in case the image doesn't compress
-	return ((width+15)&(~15)) * ((height+15)&(~15)) * 6 + 2048;
+	unsigned long retval=0;
+	if(width<1 || height<1)
+		_throw("Invalid argument in TJBUFSIZE()");
+
+	// This allows for rare corner cases in which a JPEG image can actually be
+	// larger than the uncompressed input (we wouldn't mention it if it hadn't
+	// happened before.)  Widen before multiplying to avoid signed int overflow.
+	retval=(unsigned long)((width+15)&(~15)) * ((height+15)&(~15)) * 6 + 2048;
+
+	bailout:
+	return retval;
+}
+
+DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
+	int subsamp)
+{
+	unsigned long retval=0;
+	int pw, ph, cw, ch;
+	if(width<1 || height<1 || subsamp<0 || subsamp>=NUMSUBOPT)
+		_throw("Invalid argument in TJBUFSIZEYUV()");
+	pw=PAD(width, hsampfactor[subsamp]);  ph=PAD(height, vsampfactor[subsamp]);
+	cw=pw/hsampfactor[subsamp];  ch=ph/vsampfactor[subsamp];
+	retval=(unsigned long)(PAD(pw, 4))*ph;  /* unsigned long: avoid int overflow */
+	if(subsamp!=TJ_GRAYSCALE) retval+=(unsigned long)(PAD(cw, 4))*ch*2;
+
+	bailout:
+	return retval;
 }
 
 DLLEXPORT int DLLCALL tjCompress(tjhandle h,
@@ -294,6 +319,7 @@
 			-(unsigned long)(j->jdms.free_in_buffer);
 
 	bailout:
+	if(j->cinfo.global_state>CSTATE_START) jpeg_abort_compress(&j->cinfo);
 	if(row_pointer) free(row_pointer);
 	for(i=0; i<MAX_COMPONENTS; i++)
 	{
@@ -418,6 +444,34 @@
 }
 
 
+DLLEXPORT int DLLCALL tjScaledSize(int input_width, int input_height,
+	int *output_width, int *output_height)
+{
+	int i, retval=0, scaledw=0, scaledh=0;
+
+	if(input_width<1 || input_height<1 || output_width==NULL
+		|| output_height==NULL || *output_width<0 || *output_height<0)
+		_throw("Invalid argument in tjScaledSize()");
+	/* A requested dimension of 0 is replaced by the input dimension */
+	if(*output_width==0) *output_width=input_width;
+	if(*output_height==0) *output_height=input_height;
+	if(*output_width<input_width || *output_height<input_height)
+	{	/* NOTE(review): if even 1/8 is too large, the 1/8 size is still stored and 0 returned */
+		for(i=1; i<=8; i*=2)  /* divisors tried in order: 1, 2, 4, 8 */
+		{
+			scaledw=(input_width+i-1)/i;  /* integer division, rounded up */
+			scaledh=(input_height+i-1)/i;
+			if(scaledw<=*output_width && scaledh<=*output_height)
+				break;
+		}
+		*output_width=scaledw;  *output_height=scaledh;
+	}
+
+	bailout:
+	return retval;
+}
+
+
 DLLEXPORT int DLLCALL tjDecompress(tjhandle h,
 	unsigned char *srcbuf, unsigned long size,
 	unsigned char *dstbuf, int width, int pitch, int height, int ps,
@@ -427,6 +481,7 @@
 	int cw[MAX_COMPONENTS], ch[MAX_COMPONENTS], iw[MAX_COMPONENTS],
 		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
 	JSAMPLE *_tmpbuf=NULL;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+	int scale_num=1, scale_denom=1, jpegwidth, jpegheight, scaledw, scaledh;
 
 	checkhandle(h);
 
@@ -436,14 +491,12 @@
 	}
 
 	if(srcbuf==NULL || size<=0
-		|| dstbuf==NULL || width<=0 || pitch<0 || height<=0)
+		|| dstbuf==NULL || width<0 || pitch<0 || height<0)
 		_throw("Invalid argument in tjDecompress()");
 	if(ps!=3 && ps!=4 && ps!=1)
 		_throw("This decompressor can only handle 24-bit and 32-bit RGB or 8-bit grayscale output");
 	if(!j->initd) _throw("Instance has not been initialized for decompression");
 
-	if(pitch==0) pitch=width*ps;
-
 	if(flags&TJ_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
 	else if(flags&TJ_FORCESSE) putenv("JSIMD_FORCESSE=1");
 	else if(flags&TJ_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
@@ -459,6 +512,24 @@
 
 	jpeg_read_header(&j->dinfo, TRUE);
 
+	jpegwidth=j->dinfo.image_width;  jpegheight=j->dinfo.image_height;
+	if(width==0) width=jpegwidth;
+	if(height==0) height=jpegheight;
+	if(width<jpegwidth || height<jpegheight)
+	{
+		for(i=1; i<=8; i*=2)
+		{
+			scaledw=(jpegwidth+i-1)/i;
+			scaledh=(jpegheight+i-1)/i;
+			if(scaledw<=width && scaledh<=height)
+				break;
+		}
+		if(scaledw>width || scaledh>height)
+			_throw("Could not scale down to desired image dimensions");
+		width=scaledw;  height=scaledh;
+		scale_denom=i;
+	}
+
 	if(flags&TJ_YUV)
 	{
 		j_decompress_ptr dinfo=&j->dinfo;
@@ -470,10 +541,10 @@
 			int ih;
 			iw[i]=compptr->width_in_blocks*DCTSIZE;
 			ih=compptr->height_in_blocks*DCTSIZE;
-			cw[i]=PAD(width, dinfo->max_h_samp_factor)*compptr->h_samp_factor
-				/dinfo->max_h_samp_factor;
-			ch[i]=PAD(height, dinfo->max_v_samp_factor)*compptr->v_samp_factor
-				/dinfo->max_v_samp_factor;
+			cw[i]=PAD(dinfo->image_width, dinfo->max_h_samp_factor)
+				*compptr->h_samp_factor/dinfo->max_h_samp_factor;
+			ch[i]=PAD(dinfo->image_height, dinfo->max_v_samp_factor)
+				*compptr->v_samp_factor/dinfo->max_v_samp_factor;
 			if(iw[i]!=cw[i] || ih!=ch[i]) usetmpbuf=1;
 			th[i]=compptr->v_samp_factor*DCTSIZE;
 			tmpbufsize+=iw[i]*th[i];
@@ -503,16 +574,6 @@
 			}
 		}
 	}
-	else
-	{
-		if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-			_throw("Memory allocation failed in tjInitDecompress()");
-		for(i=0; i<height; i++)
-		{
-			if(flags&TJ_BOTTOMUP) row_pointer[i]= &dstbuf[(height-i-1)*pitch];
-			else row_pointer[i]= &dstbuf[i*pitch];
-		}
-	}
 
 	if(ps==1) j->dinfo.out_color_space = JCS_GRAYSCALE;
 	#if JCS_EXTENSIONS==1
@@ -533,6 +594,11 @@
 
 	if(flags&TJ_FASTUPSAMPLE) j->dinfo.do_fancy_upsampling=FALSE;
 	if(flags&TJ_YUV) j->dinfo.raw_data_out=TRUE;
+	else
+	{
+		j->dinfo.scale_num=scale_num;
+		j->dinfo.scale_denom=scale_denom;
+	}
 
 	jpeg_start_decompress(&j->dinfo);
 	if(flags&TJ_YUV)
@@ -567,6 +633,16 @@
 	}
 	else
 	{
+		if(pitch==0) pitch=j->dinfo.output_width*ps;
+		if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)
+			*j->dinfo.output_height))==NULL)
+			_throw("Memory allocation failed in tjInitDecompress()");
+		for(i=0; i<j->dinfo.output_height; i++)
+		{
+			if(flags&TJ_BOTTOMUP)
+				row_pointer[i]= &dstbuf[(j->dinfo.output_height-i-1)*pitch];
+			else row_pointer[i]= &dstbuf[i*pitch];
+		}
 		while(j->dinfo.output_scanline<j->dinfo.output_height)
 		{
 			jpeg_read_scanlines(&j->dinfo, &row_pointer[j->dinfo.output_scanline],
@@ -576,6 +652,7 @@
 	jpeg_finish_decompress(&j->dinfo);
 
 	bailout:
+	if(j->dinfo.global_state>DSTATE_START) jpeg_abort_decompress(&j->dinfo);
 	for(i=0; i<MAX_COMPONENTS; i++)
 	{
 		if(tmpbuf[i]) free(tmpbuf[i]);